OpenGL interop is simplified when the cl_context is not created by SKC.
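
A minimal host-side sketch of what that enables (assumed usage, not SKC's
actual API): the caller builds a GL-sharing cl_context with the standard
cl_khr_gl_sharing properties and hands it to SKC, instead of SKC owning
context creation. The GLX case is shown; skc_context_create_cl() is a
hypothetical name for the hand-off.

  #include <CL/cl.h>
  #include <CL/cl_gl.h>
  #include <GL/glx.h>

  /* Create a cl_context that shares objects with the current GL context. */
  static cl_context
  create_gl_shared_context(cl_platform_id platform, cl_device_id device)
  {
    cl_context_properties const props[] = {
      CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
      CL_GL_CONTEXT_KHR,   (cl_context_properties)glXGetCurrentContext(),
      CL_GLX_DISPLAY_KHR,  (cl_context_properties)glXGetCurrentDisplay(),
      0
    };

    cl_int      err;
    cl_context  context = clCreateContext(props, 1, &device, NULL, NULL, &err);

    /* err should be checked; the context is then handed to SKC, e.g.
       skc_context_create_cl(context, ...) -- hypothetical entry point. */
    return context;
  }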

Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built.
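
The new hs_cl.cl below is generated code built almost entirely from the
HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF macros defined in hs_cl_macros.h.
As a rough sketch only (the real macro lives in that header), the
compare-exchange step the kernels repeat boils down to:

  // Order a pair of keys: the smaller key ends up in 'a', the larger in 'b'.
  #define HS_CMP_XCHG_SKETCH(a, b)           \
    {                                        \
      HS_KEY_TYPE const t_ = min(a, b);      \
      b                    = max(a, b);      \
      a                    = t_;             \
    }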

Bug: skia:
Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b
Reviewed-on: https://skia-review.googlesource.com/136608
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
diff --git a/src/compute/common/cl/find_cl.c b/src/compute/common/cl/find_cl.c
index 613d2b9..43b26d1 100644
--- a/src/compute/common/cl/find_cl.c
+++ b/src/compute/common/cl/find_cl.c
@@ -20,7 +20,7 @@
 
 #include "find_cl.h"
 #include "assert_cl.h"
-#include "macros.h"
+#include "../macros.h"
 
 //
 // search platforms and devices for a match
diff --git a/src/compute/common/macros.h b/src/compute/common/macros.h
index d91a000..52dc868 100644
--- a/src/compute/common/macros.h
+++ b/src/compute/common/macros.h
@@ -38,9 +38,9 @@
 //
 
 #if defined(_MSC_VER)
-    #define ALLOCA(n)  _alloca(n)
+#define ALLOCA(n)  _alloca(n)
 #else
-    #define ALLOCA(n) alloca(n)
+#define ALLOCA(n) alloca(n)
 #endif
 //
 //
diff --git a/src/compute/hs/cl/gen9/hs_cl.cl b/src/compute/hs/cl/gen9/hs_cl.cl
new file mode 100644
index 0000000..63627ad
--- /dev/null
+++ b/src/compute/hs/cl/gen9/hs_cl.cl
@@ -0,0 +1,10082 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#include <hs_cl_macros.h>
+
+//
+//
+//
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_transpose(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8];
+  HS_TRANSPOSE_SLAB()
+}
+
+__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_4(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 128];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id();
+  uint const smem_r_idx =
+    (get_sub_group_id() ^ 1) * 128 + (get_sub_group_local_id() ^ 7);
+  (shared.m + get_local_id(0))[16 * 8 * 0] = r1;
+  (shared.m + get_local_id(0))[16 * 8 * 1] = r16;
+  (shared.m + get_local_id(0))[16 * 8 * 2] = r2;
+  (shared.m + get_local_id(0))[16 * 8 * 3] = r15;
+  (shared.m + get_local_id(0))[16 * 8 * 4] = r3;
+  (shared.m + get_local_id(0))[16 * 8 * 5] = r14;
+  (shared.m + get_local_id(0))[16 * 8 * 6] = r4;
+  (shared.m + get_local_id(0))[16 * 8 * 7] = r13;
+  (shared.m + get_local_id(0))[16 * 8 * 8] = r5;
+  (shared.m + get_local_id(0))[16 * 8 * 9] = r12;
+  (shared.m + get_local_id(0))[16 * 8 * 10] = r6;
+  (shared.m + get_local_id(0))[16 * 8 * 11] = r11;
+  (shared.m + get_local_id(0))[16 * 8 * 12] = r7;
+  (shared.m + get_local_id(0))[16 * 8 * 13] = r10;
+  (shared.m + get_local_id(0))[16 * 8 * 14] = r8;
+  (shared.m + get_local_id(0))[16 * 8 * 15] = r9;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  {
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_r_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[16] = r1_1;
+      (shared.m + smem_r_idx)[24] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32];
+      HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40];
+      HS_CMP_XCHG(r2_1, r2_2)
+      (shared.m + smem_l_idx)[32] = r2_1;
+      (shared.m + smem_r_idx)[40] = r2_2;
+    }
+    {
+      HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48];
+      HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56];
+      HS_CMP_XCHG(r3_1, r3_2)
+      (shared.m + smem_l_idx)[48] = r3_1;
+      (shared.m + smem_r_idx)[56] = r3_2;
+    }
+    {
+      HS_KEY_TYPE r4_1 = (shared.m + smem_l_idx)[64];
+      HS_KEY_TYPE r4_2 = (shared.m + smem_r_idx)[72];
+      HS_CMP_XCHG(r4_1, r4_2)
+      (shared.m + smem_l_idx)[64] = r4_1;
+      (shared.m + smem_r_idx)[72] = r4_2;
+    }
+    {
+      HS_KEY_TYPE r5_1 = (shared.m + smem_l_idx)[80];
+      HS_KEY_TYPE r5_2 = (shared.m + smem_r_idx)[88];
+      HS_CMP_XCHG(r5_1, r5_2)
+      (shared.m + smem_l_idx)[80] = r5_1;
+      (shared.m + smem_r_idx)[88] = r5_2;
+    }
+    {
+      HS_KEY_TYPE r6_1 = (shared.m + smem_l_idx)[96];
+      HS_KEY_TYPE r6_2 = (shared.m + smem_r_idx)[104];
+      HS_CMP_XCHG(r6_1, r6_2)
+      (shared.m + smem_l_idx)[96] = r6_1;
+      (shared.m + smem_r_idx)[104] = r6_2;
+    }
+    {
+      HS_KEY_TYPE r7_1 = (shared.m + smem_l_idx)[112];
+      HS_KEY_TYPE r7_2 = (shared.m + smem_r_idx)[120];
+      HS_CMP_XCHG(r7_1, r7_2)
+      (shared.m + smem_l_idx)[112] = r7_1;
+      (shared.m + smem_r_idx)[120] = r7_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
+  r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
+  r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
+  r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
+  r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
+  r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
+  r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
+  r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
+  r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
+  r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
+  r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
+  r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
+  r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
+  r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
+  r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
+  r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
+  { { uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_r_idx)[16] = r0_3;
+    (shared.m + smem_r_idx)[24] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32];
+    HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40];
+    HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48];
+    HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56];
+    HS_CMP_XCHG(r1_2, r1_3)
+    HS_CMP_XCHG(r1_1, r1_4)
+    HS_CMP_XCHG(r1_3, r1_4)
+    HS_CMP_XCHG(r1_1, r1_2)
+    (shared.m + smem_l_idx)[32] = r1_1;
+    (shared.m + smem_l_idx)[40] = r1_2;
+    (shared.m + smem_r_idx)[48] = r1_3;
+    (shared.m + smem_r_idx)[56] = r1_4;
+  }
+  {
+    HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[64];
+    HS_KEY_TYPE r2_2 = (shared.m + smem_l_idx)[72];
+    HS_KEY_TYPE r2_3 = (shared.m + smem_r_idx)[80];
+    HS_KEY_TYPE r2_4 = (shared.m + smem_r_idx)[88];
+    HS_CMP_XCHG(r2_2, r2_3)
+    HS_CMP_XCHG(r2_1, r2_4)
+    HS_CMP_XCHG(r2_3, r2_4)
+    HS_CMP_XCHG(r2_1, r2_2)
+    (shared.m + smem_l_idx)[64] = r2_1;
+    (shared.m + smem_l_idx)[72] = r2_2;
+    (shared.m + smem_r_idx)[80] = r2_3;
+    (shared.m + smem_r_idx)[88] = r2_4;
+  }
+  {
+    HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[96];
+    HS_KEY_TYPE r3_2 = (shared.m + smem_l_idx)[104];
+    HS_KEY_TYPE r3_3 = (shared.m + smem_r_idx)[112];
+    HS_KEY_TYPE r3_4 = (shared.m + smem_r_idx)[120];
+    HS_CMP_XCHG(r3_2, r3_3)
+    HS_CMP_XCHG(r3_1, r3_4)
+    HS_CMP_XCHG(r3_3, r3_4)
+    HS_CMP_XCHG(r3_1, r3_2)
+    (shared.m + smem_l_idx)[96] = r3_1;
+    (shared.m + smem_l_idx)[104] = r3_2;
+    (shared.m + smem_r_idx)[112] = r3_3;
+    (shared.m + smem_r_idx)[120] = r3_4;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
+{ { uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
+    HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32];
+    HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40];
+    HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48];
+    HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56];
+    HS_CMP_XCHG(r0_4, r0_5)
+    HS_CMP_XCHG(r0_3, r0_6)
+    HS_CMP_XCHG(r0_2, r0_7)
+    HS_CMP_XCHG(r0_1, r0_8)
+    HS_CMP_XCHG(r0_5, r0_7)
+    HS_CMP_XCHG(r0_6, r0_8)
+    HS_CMP_XCHG(r0_5, r0_6)
+    HS_CMP_XCHG(r0_7, r0_8)
+    HS_CMP_XCHG(r0_1, r0_3)
+    HS_CMP_XCHG(r0_2, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    HS_CMP_XCHG(r0_3, r0_4)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_l_idx)[16] = r0_3;
+    (shared.m + smem_l_idx)[24] = r0_4;
+    (shared.m + smem_r_idx)[32] = r0_5;
+    (shared.m + smem_r_idx)[40] = r0_6;
+    (shared.m + smem_r_idx)[48] = r0_7;
+    (shared.m + smem_r_idx)[56] = r0_8;
+  }
+  {
+    HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[64];
+    HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[72];
+    HS_KEY_TYPE r1_3 = (shared.m + smem_l_idx)[80];
+    HS_KEY_TYPE r1_4 = (shared.m + smem_l_idx)[88];
+    HS_KEY_TYPE r1_5 = (shared.m + smem_r_idx)[96];
+    HS_KEY_TYPE r1_6 = (shared.m + smem_r_idx)[104];
+    HS_KEY_TYPE r1_7 = (shared.m + smem_r_idx)[112];
+    HS_KEY_TYPE r1_8 = (shared.m + smem_r_idx)[120];
+    HS_CMP_XCHG(r1_4, r1_5)
+    HS_CMP_XCHG(r1_3, r1_6)
+    HS_CMP_XCHG(r1_2, r1_7)
+    HS_CMP_XCHG(r1_1, r1_8)
+    HS_CMP_XCHG(r1_5, r1_7)
+    HS_CMP_XCHG(r1_6, r1_8)
+    HS_CMP_XCHG(r1_5, r1_6)
+    HS_CMP_XCHG(r1_7, r1_8)
+    HS_CMP_XCHG(r1_1, r1_3)
+    HS_CMP_XCHG(r1_2, r1_4)
+    HS_CMP_XCHG(r1_1, r1_2)
+    HS_CMP_XCHG(r1_3, r1_4)
+    (shared.m + smem_l_idx)[64] = r1_1;
+    (shared.m + smem_l_idx)[72] = r1_2;
+    (shared.m + smem_l_idx)[80] = r1_3;
+    (shared.m + smem_l_idx)[88] = r1_4;
+    (shared.m + smem_r_idx)[96] = r1_5;
+    (shared.m + smem_r_idx)[104] = r1_6;
+    (shared.m + smem_r_idx)[112] = r1_7;
+    (shared.m + smem_r_idx)[120] = r1_8;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
+{ { uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[16 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[16 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[16 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[16 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[16 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[16 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[16 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[16 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[16 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[16 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[16 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[16 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[16 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[16 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[16 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[16 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
+    HS_KEY_TYPE r0_5 = (shared.m + smem_l_idx)[32];
+    HS_KEY_TYPE r0_6 = (shared.m + smem_l_idx)[40];
+    HS_KEY_TYPE r0_7 = (shared.m + smem_l_idx)[48];
+    HS_KEY_TYPE r0_8 = (shared.m + smem_l_idx)[56];
+    HS_KEY_TYPE r0_9 = (shared.m + smem_r_idx)[64];
+    HS_KEY_TYPE r0_10 = (shared.m + smem_r_idx)[72];
+    HS_KEY_TYPE r0_11 = (shared.m + smem_r_idx)[80];
+    HS_KEY_TYPE r0_12 = (shared.m + smem_r_idx)[88];
+    HS_KEY_TYPE r0_13 = (shared.m + smem_r_idx)[96];
+    HS_KEY_TYPE r0_14 = (shared.m + smem_r_idx)[104];
+    HS_KEY_TYPE r0_15 = (shared.m + smem_r_idx)[112];
+    HS_KEY_TYPE r0_16 = (shared.m + smem_r_idx)[120];
+    HS_CMP_XCHG(r0_8, r0_9)
+    HS_CMP_XCHG(r0_7, r0_10)
+    HS_CMP_XCHG(r0_6, r0_11)
+    HS_CMP_XCHG(r0_5, r0_12)
+    HS_CMP_XCHG(r0_4, r0_13)
+    HS_CMP_XCHG(r0_3, r0_14)
+    HS_CMP_XCHG(r0_2, r0_15)
+    HS_CMP_XCHG(r0_1, r0_16)
+    HS_CMP_XCHG(r0_9, r0_13)
+    HS_CMP_XCHG(r0_11, r0_15)
+    HS_CMP_XCHG(r0_9, r0_11)
+    HS_CMP_XCHG(r0_13, r0_15)
+    HS_CMP_XCHG(r0_10, r0_14)
+    HS_CMP_XCHG(r0_12, r0_16)
+    HS_CMP_XCHG(r0_10, r0_12)
+    HS_CMP_XCHG(r0_14, r0_16)
+    HS_CMP_XCHG(r0_9, r0_10)
+    HS_CMP_XCHG(r0_11, r0_12)
+    HS_CMP_XCHG(r0_13, r0_14)
+    HS_CMP_XCHG(r0_15, r0_16)
+    HS_CMP_XCHG(r0_1, r0_5)
+    HS_CMP_XCHG(r0_3, r0_7)
+    HS_CMP_XCHG(r0_1, r0_3)
+    HS_CMP_XCHG(r0_5, r0_7)
+    HS_CMP_XCHG(r0_2, r0_6)
+    HS_CMP_XCHG(r0_4, r0_8)
+    HS_CMP_XCHG(r0_2, r0_4)
+    HS_CMP_XCHG(r0_6, r0_8)
+    HS_CMP_XCHG(r0_1, r0_2)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_5, r0_6)
+    HS_CMP_XCHG(r0_7, r0_8)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_l_idx)[16] = r0_3;
+    (shared.m + smem_l_idx)[24] = r0_4;
+    (shared.m + smem_l_idx)[32] = r0_5;
+    (shared.m + smem_l_idx)[40] = r0_6;
+    (shared.m + smem_l_idx)[48] = r0_7;
+    (shared.m + smem_l_idx)[56] = r0_8;
+    (shared.m + smem_r_idx)[64] = r0_9;
+    (shared.m + smem_r_idx)[72] = r0_10;
+    (shared.m + smem_r_idx)[80] = r0_11;
+    (shared.m + smem_r_idx)[88] = r0_12;
+    (shared.m + smem_r_idx)[96] = r0_13;
+    (shared.m + smem_r_idx)[104] = r0_14;
+    (shared.m + smem_r_idx)[112] = r0_15;
+    (shared.m + smem_r_idx)[120] = r0_16;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[16 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[16 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[16 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[16 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[16 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[16 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[16 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[16 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[16 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[16 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[16 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[16 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[16 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[16 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[16 * 8 * 15];
+{ { uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_3(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 64];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id();
+  uint const smem_r_idx =
+    (get_sub_group_id() ^ 1) * 64 + (get_sub_group_local_id() ^ 7);
+  (shared.m + get_local_id(0))[8 * 8 * 0] = r1;
+  (shared.m + get_local_id(0))[8 * 8 * 1] = r16;
+  (shared.m + get_local_id(0))[8 * 8 * 2] = r2;
+  (shared.m + get_local_id(0))[8 * 8 * 3] = r15;
+  (shared.m + get_local_id(0))[8 * 8 * 4] = r3;
+  (shared.m + get_local_id(0))[8 * 8 * 5] = r14;
+  (shared.m + get_local_id(0))[8 * 8 * 6] = r4;
+  (shared.m + get_local_id(0))[8 * 8 * 7] = r13;
+  (shared.m + get_local_id(0))[8 * 8 * 8] = r5;
+  (shared.m + get_local_id(0))[8 * 8 * 9] = r12;
+  (shared.m + get_local_id(0))[8 * 8 * 10] = r6;
+  (shared.m + get_local_id(0))[8 * 8 * 11] = r11;
+  (shared.m + get_local_id(0))[8 * 8 * 12] = r7;
+  (shared.m + get_local_id(0))[8 * 8 * 13] = r10;
+  (shared.m + get_local_id(0))[8 * 8 * 14] = r8;
+  (shared.m + get_local_id(0))[8 * 8 * 15] = r9;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  {
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_r_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[16] = r1_1;
+      (shared.m + smem_r_idx)[24] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[32];
+      HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[40];
+      HS_CMP_XCHG(r2_1, r2_2)
+      (shared.m + smem_l_idx)[32] = r2_1;
+      (shared.m + smem_r_idx)[40] = r2_2;
+    }
+    {
+      HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[48];
+      HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[56];
+      HS_CMP_XCHG(r3_1, r3_2)
+      (shared.m + smem_l_idx)[48] = r3_1;
+      (shared.m + smem_r_idx)[56] = r3_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[520];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[512] = r0_1;
+      (shared.m + smem_r_idx)[520] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[528];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[536];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[528] = r1_1;
+      (shared.m + smem_r_idx)[536] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r2_1 = (shared.m + smem_l_idx)[544];
+      HS_KEY_TYPE r2_2 = (shared.m + smem_r_idx)[552];
+      HS_CMP_XCHG(r2_1, r2_2)
+      (shared.m + smem_l_idx)[544] = r2_1;
+      (shared.m + smem_r_idx)[552] = r2_2;
+    }
+    {
+      HS_KEY_TYPE r3_1 = (shared.m + smem_l_idx)[560];
+      HS_KEY_TYPE r3_2 = (shared.m + smem_r_idx)[568];
+      HS_CMP_XCHG(r3_1, r3_2)
+      (shared.m + smem_l_idx)[560] = r3_1;
+      (shared.m + smem_r_idx)[568] = r3_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
+  r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
+  r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
+  r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
+  r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
+  r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
+  r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
+  r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
+  r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
+  r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
+  r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
+  r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
+  r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
+  r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
+  r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
+  r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
+  { { uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[8 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[8 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[8 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[8 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[8 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[8 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[8 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[8 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[8 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[8 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[8 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[8 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[8 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[8 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[8 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[8 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_r_idx)[16] = r0_3;
+    (shared.m + smem_r_idx)[24] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[32];
+    HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[40];
+    HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[48];
+    HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[56];
+    HS_CMP_XCHG(r1_2, r1_3)
+    HS_CMP_XCHG(r1_1, r1_4)
+    HS_CMP_XCHG(r1_3, r1_4)
+    HS_CMP_XCHG(r1_1, r1_2)
+    (shared.m + smem_l_idx)[32] = r1_1;
+    (shared.m + smem_l_idx)[40] = r1_2;
+    (shared.m + smem_r_idx)[48] = r1_3;
+    (shared.m + smem_r_idx)[56] = r1_4;
+  }
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[528];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[536];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[512] = r0_1;
+    (shared.m + smem_l_idx)[520] = r0_2;
+    (shared.m + smem_r_idx)[528] = r0_3;
+    (shared.m + smem_r_idx)[536] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[544];
+    HS_KEY_TYPE r1_2 = (shared.m + smem_l_idx)[552];
+    HS_KEY_TYPE r1_3 = (shared.m + smem_r_idx)[560];
+    HS_KEY_TYPE r1_4 = (shared.m + smem_r_idx)[568];
+    HS_CMP_XCHG(r1_2, r1_3)
+    HS_CMP_XCHG(r1_1, r1_4)
+    HS_CMP_XCHG(r1_3, r1_4)
+    HS_CMP_XCHG(r1_1, r1_2)
+    (shared.m + smem_l_idx)[544] = r1_1;
+    (shared.m + smem_l_idx)[552] = r1_2;
+    (shared.m + smem_r_idx)[560] = r1_3;
+    (shared.m + smem_r_idx)[568] = r1_4;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
+{ { uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[8 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[8 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[8 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[8 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[8 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[8 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[8 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[8 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[8 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[8 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[8 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[8 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[8 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[8 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[8 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[8 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[24];
+    HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[32];
+    HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[40];
+    HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[48];
+    HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[56];
+    HS_CMP_XCHG(r0_4, r0_5)
+    HS_CMP_XCHG(r0_3, r0_6)
+    HS_CMP_XCHG(r0_2, r0_7)
+    HS_CMP_XCHG(r0_1, r0_8)
+    HS_CMP_XCHG(r0_5, r0_7)
+    HS_CMP_XCHG(r0_6, r0_8)
+    HS_CMP_XCHG(r0_5, r0_6)
+    HS_CMP_XCHG(r0_7, r0_8)
+    HS_CMP_XCHG(r0_1, r0_3)
+    HS_CMP_XCHG(r0_2, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    HS_CMP_XCHG(r0_3, r0_4)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_l_idx)[16] = r0_3;
+    (shared.m + smem_l_idx)[24] = r0_4;
+    (shared.m + smem_r_idx)[32] = r0_5;
+    (shared.m + smem_r_idx)[40] = r0_6;
+    (shared.m + smem_r_idx)[48] = r0_7;
+    (shared.m + smem_r_idx)[56] = r0_8;
+  }
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[512];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[520];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_l_idx)[528];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_l_idx)[536];
+    HS_KEY_TYPE r0_5 = (shared.m + smem_r_idx)[544];
+    HS_KEY_TYPE r0_6 = (shared.m + smem_r_idx)[552];
+    HS_KEY_TYPE r0_7 = (shared.m + smem_r_idx)[560];
+    HS_KEY_TYPE r0_8 = (shared.m + smem_r_idx)[568];
+    HS_CMP_XCHG(r0_4, r0_5)
+    HS_CMP_XCHG(r0_3, r0_6)
+    HS_CMP_XCHG(r0_2, r0_7)
+    HS_CMP_XCHG(r0_1, r0_8)
+    HS_CMP_XCHG(r0_5, r0_7)
+    HS_CMP_XCHG(r0_6, r0_8)
+    HS_CMP_XCHG(r0_5, r0_6)
+    HS_CMP_XCHG(r0_7, r0_8)
+    HS_CMP_XCHG(r0_1, r0_3)
+    HS_CMP_XCHG(r0_2, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    HS_CMP_XCHG(r0_3, r0_4)
+    (shared.m + smem_l_idx)[512] = r0_1;
+    (shared.m + smem_l_idx)[520] = r0_2;
+    (shared.m + smem_l_idx)[528] = r0_3;
+    (shared.m + smem_l_idx)[536] = r0_4;
+    (shared.m + smem_r_idx)[544] = r0_5;
+    (shared.m + smem_r_idx)[552] = r0_6;
+    (shared.m + smem_r_idx)[560] = r0_7;
+    (shared.m + smem_r_idx)[568] = r0_8;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[8 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[8 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[8 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[8 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[8 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[8 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[8 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[8 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[8 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[8 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[8 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[8 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[8 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[8 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[8 * 8 * 15];
+{
+{
+uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
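+//
+// hs_kernel_bs_2 (intent inferred from the generated code): a "block sort"
+// over 4 slabs. A 32-wide work group (4 subgroups of 8 lanes, 16 keys per
+// lane) loads 512 keys, sorts each lane's registers with a compare-exchange
+// network (HS_CMP_XCHG), merges lanes within a subgroup via the shuffle-based
+// HS_CMP_FLIP / HS_CMP_HALF stages, and finally merges subgroup pairs through
+// local memory before writing the slab-ordered keys back out.
+//
+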
+__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_2(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 32];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id();
+  uint const smem_r_idx =
+    (get_sub_group_id() ^ 1) * 32 + (get_sub_group_local_id() ^ 7);
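+  // cross-subgroup merge (inferred): registers are staged in local memory,
+  // then paired subgroups compare-exchange mirrored rows via smem_l_idx and
+  // smem_r_idx before the keys are reloaded and cleaned in registers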
+  (shared.m + get_local_id(0))[4 * 8 * 0] = r1;
+  (shared.m + get_local_id(0))[4 * 8 * 1] = r16;
+  (shared.m + get_local_id(0))[4 * 8 * 2] = r2;
+  (shared.m + get_local_id(0))[4 * 8 * 3] = r15;
+  (shared.m + get_local_id(0))[4 * 8 * 4] = r3;
+  (shared.m + get_local_id(0))[4 * 8 * 5] = r14;
+  (shared.m + get_local_id(0))[4 * 8 * 6] = r4;
+  (shared.m + get_local_id(0))[4 * 8 * 7] = r13;
+  (shared.m + get_local_id(0))[4 * 8 * 8] = r5;
+  (shared.m + get_local_id(0))[4 * 8 * 9] = r12;
+  (shared.m + get_local_id(0))[4 * 8 * 10] = r6;
+  (shared.m + get_local_id(0))[4 * 8 * 11] = r11;
+  (shared.m + get_local_id(0))[4 * 8 * 12] = r7;
+  (shared.m + get_local_id(0))[4 * 8 * 13] = r10;
+  (shared.m + get_local_id(0))[4 * 8 * 14] = r8;
+  (shared.m + get_local_id(0))[4 * 8 * 15] = r9;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  {
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_r_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[16];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[24];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[16] = r1_1;
+      (shared.m + smem_r_idx)[24] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[128] = r0_1;
+      (shared.m + smem_r_idx)[136] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[144];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[152];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[144] = r1_1;
+      (shared.m + smem_r_idx)[152] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[264];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[256] = r0_1;
+      (shared.m + smem_r_idx)[264] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[272];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[280];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[272] = r1_1;
+      (shared.m + smem_r_idx)[280] = r1_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[392];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[384] = r0_1;
+      (shared.m + smem_r_idx)[392] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r1_1 = (shared.m + smem_l_idx)[400];
+      HS_KEY_TYPE r1_2 = (shared.m + smem_r_idx)[408];
+      HS_CMP_XCHG(r1_1, r1_2)
+      (shared.m + smem_l_idx)[400] = r1_1;
+      (shared.m + smem_r_idx)[408] = r1_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
+  r16 = (shared.m + get_local_id(0))[4 * 8 * 1];
+  r2 = (shared.m + get_local_id(0))[4 * 8 * 2];
+  r15 = (shared.m + get_local_id(0))[4 * 8 * 3];
+  r3 = (shared.m + get_local_id(0))[4 * 8 * 4];
+  r14 = (shared.m + get_local_id(0))[4 * 8 * 5];
+  r4 = (shared.m + get_local_id(0))[4 * 8 * 6];
+  r13 = (shared.m + get_local_id(0))[4 * 8 * 7];
+  r5 = (shared.m + get_local_id(0))[4 * 8 * 8];
+  r12 = (shared.m + get_local_id(0))[4 * 8 * 9];
+  r6 = (shared.m + get_local_id(0))[4 * 8 * 10];
+  r11 = (shared.m + get_local_id(0))[4 * 8 * 11];
+  r7 = (shared.m + get_local_id(0))[4 * 8 * 12];
+  r10 = (shared.m + get_local_id(0))[4 * 8 * 13];
+  r8 = (shared.m + get_local_id(0))[4 * 8 * 14];
+  r9 = (shared.m + get_local_id(0))[4 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(shared.m + get_local_id(0))[4 * 8 * 0] = r1;
+(shared.m + get_local_id(0))[4 * 8 * 1] = r16;
+(shared.m + get_local_id(0))[4 * 8 * 2] = r2;
+(shared.m + get_local_id(0))[4 * 8 * 3] = r15;
+(shared.m + get_local_id(0))[4 * 8 * 4] = r3;
+(shared.m + get_local_id(0))[4 * 8 * 5] = r14;
+(shared.m + get_local_id(0))[4 * 8 * 6] = r4;
+(shared.m + get_local_id(0))[4 * 8 * 7] = r13;
+(shared.m + get_local_id(0))[4 * 8 * 8] = r5;
+(shared.m + get_local_id(0))[4 * 8 * 9] = r12;
+(shared.m + get_local_id(0))[4 * 8 * 10] = r6;
+(shared.m + get_local_id(0))[4 * 8 * 11] = r11;
+(shared.m + get_local_id(0))[4 * 8 * 12] = r7;
+(shared.m + get_local_id(0))[4 * 8 * 13] = r10;
+(shared.m + get_local_id(0))[4 * 8 * 14] = r8;
+(shared.m + get_local_id(0))[4 * 8 * 15] = r9;
+barrier(CLK_LOCAL_MEM_FENCE);
+{
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[8];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[16];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[24];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[0] = r0_1;
+    (shared.m + smem_l_idx)[8] = r0_2;
+    (shared.m + smem_r_idx)[16] = r0_3;
+    (shared.m + smem_r_idx)[24] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[136];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[144];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[152];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[128] = r0_1;
+    (shared.m + smem_l_idx)[136] = r0_2;
+    (shared.m + smem_r_idx)[144] = r0_3;
+    (shared.m + smem_r_idx)[152] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[256];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[264];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[272];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[280];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[256] = r0_1;
+    (shared.m + smem_l_idx)[264] = r0_2;
+    (shared.m + smem_r_idx)[272] = r0_3;
+    (shared.m + smem_r_idx)[280] = r0_4;
+  }
+  {
+    HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[384];
+    HS_KEY_TYPE r0_2 = (shared.m + smem_l_idx)[392];
+    HS_KEY_TYPE r0_3 = (shared.m + smem_r_idx)[400];
+    HS_KEY_TYPE r0_4 = (shared.m + smem_r_idx)[408];
+    HS_CMP_XCHG(r0_2, r0_3)
+    HS_CMP_XCHG(r0_1, r0_4)
+    HS_CMP_XCHG(r0_3, r0_4)
+    HS_CMP_XCHG(r0_1, r0_2)
+    (shared.m + smem_l_idx)[384] = r0_1;
+    (shared.m + smem_l_idx)[392] = r0_2;
+    (shared.m + smem_r_idx)[400] = r0_3;
+    (shared.m + smem_r_idx)[408] = r0_4;
+  }
+}
+barrier(CLK_LOCAL_MEM_FENCE);
+r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
+r16 = (shared.m + get_local_id(0))[4 * 8 * 1];
+r2 = (shared.m + get_local_id(0))[4 * 8 * 2];
+r15 = (shared.m + get_local_id(0))[4 * 8 * 3];
+r3 = (shared.m + get_local_id(0))[4 * 8 * 4];
+r14 = (shared.m + get_local_id(0))[4 * 8 * 5];
+r4 = (shared.m + get_local_id(0))[4 * 8 * 6];
+r13 = (shared.m + get_local_id(0))[4 * 8 * 7];
+r5 = (shared.m + get_local_id(0))[4 * 8 * 8];
+r12 = (shared.m + get_local_id(0))[4 * 8 * 9];
+r6 = (shared.m + get_local_id(0))[4 * 8 * 10];
+r11 = (shared.m + get_local_id(0))[4 * 8 * 11];
+r7 = (shared.m + get_local_id(0))[4 * 8 * 12];
+r10 = (shared.m + get_local_id(0))[4 * 8 * 13];
+r8 = (shared.m + get_local_id(0))[4 * 8 * 14];
+r9 = (shared.m + get_local_id(0))[4 * 8 * 15];
+{
+{
+uint const half_lane_mask = 4;
+uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+int const t_lt = get_sub_group_local_id() < half_lane_idx;
+HS_CMP_HALF(0, r1)
+HS_CMP_HALF(1, r2)
+HS_CMP_HALF(2, r3)
+HS_CMP_HALF(3, r4)
+HS_CMP_HALF(4, r5)
+HS_CMP_HALF(5, r6)
+HS_CMP_HALF(6, r7)
+HS_CMP_HALF(7, r8)
+HS_CMP_HALF(8, r9)
+HS_CMP_HALF(9, r10)
+HS_CMP_HALF(10, r11)
+HS_CMP_HALF(11, r12)
+HS_CMP_HALF(12, r13)
+HS_CMP_HALF(13, r14)
+HS_CMP_HALF(14, r15)
+HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
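+//
+// hs_kernel_bs_1 (intent inferred): the 2-slab block sort. A 16-wide work
+// group (2 subgroups) sorts 256 keys with the same register network as
+// above, followed by a single subgroup-pair merge through local memory.
+//
+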
+__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_1(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 16];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id();
+  uint const smem_r_idx =
+    (get_sub_group_id() ^ 1) * 16 + (get_sub_group_local_id() ^ 7);
+  (shared.m + get_local_id(0))[2 * 8 * 0] = r1;
+  (shared.m + get_local_id(0))[2 * 8 * 1] = r16;
+  (shared.m + get_local_id(0))[2 * 8 * 2] = r2;
+  (shared.m + get_local_id(0))[2 * 8 * 3] = r15;
+  (shared.m + get_local_id(0))[2 * 8 * 4] = r3;
+  (shared.m + get_local_id(0))[2 * 8 * 5] = r14;
+  (shared.m + get_local_id(0))[2 * 8 * 6] = r4;
+  (shared.m + get_local_id(0))[2 * 8 * 7] = r13;
+  (shared.m + get_local_id(0))[2 * 8 * 8] = r5;
+  (shared.m + get_local_id(0))[2 * 8 * 9] = r12;
+  (shared.m + get_local_id(0))[2 * 8 * 10] = r6;
+  (shared.m + get_local_id(0))[2 * 8 * 11] = r11;
+  (shared.m + get_local_id(0))[2 * 8 * 12] = r7;
+  (shared.m + get_local_id(0))[2 * 8 * 13] = r10;
+  (shared.m + get_local_id(0))[2 * 8 * 14] = r8;
+  (shared.m + get_local_id(0))[2 * 8 * 15] = r9;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  {
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[8];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_r_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[32];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[40];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[32] = r0_1;
+      (shared.m + smem_r_idx)[40] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[64];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[72];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[64] = r0_1;
+      (shared.m + smem_r_idx)[72] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[96];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[104];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[96] = r0_1;
+      (shared.m + smem_r_idx)[104] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[128];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[136];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[128] = r0_1;
+      (shared.m + smem_r_idx)[136] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[160];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[168];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[160] = r0_1;
+      (shared.m + smem_r_idx)[168] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[192];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[200];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[192] = r0_1;
+      (shared.m + smem_r_idx)[200] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (shared.m + smem_l_idx)[224];
+      HS_KEY_TYPE r0_2 = (shared.m + smem_r_idx)[232];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[224] = r0_1;
+      (shared.m + smem_r_idx)[232] = r0_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  r1 = (shared.m + get_local_id(0))[2 * 8 * 0];
+  r16 = (shared.m + get_local_id(0))[2 * 8 * 1];
+  r2 = (shared.m + get_local_id(0))[2 * 8 * 2];
+  r15 = (shared.m + get_local_id(0))[2 * 8 * 3];
+  r3 = (shared.m + get_local_id(0))[2 * 8 * 4];
+  r14 = (shared.m + get_local_id(0))[2 * 8 * 5];
+  r4 = (shared.m + get_local_id(0))[2 * 8 * 6];
+  r13 = (shared.m + get_local_id(0))[2 * 8 * 7];
+  r5 = (shared.m + get_local_id(0))[2 * 8 * 8];
+  r12 = (shared.m + get_local_id(0))[2 * 8 * 9];
+  r6 = (shared.m + get_local_id(0))[2 * 8 * 10];
+  r11 = (shared.m + get_local_id(0))[2 * 8 * 11];
+  r7 = (shared.m + get_local_id(0))[2 * 8 * 12];
+  r10 = (shared.m + get_local_id(0))[2 * 8 * 13];
+  r8 = (shared.m + get_local_id(0))[2 * 8 * 14];
+  r9 = (shared.m + get_local_id(0))[2 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
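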
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
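+//
+// hs_kernel_bs_0 (intent inferred): the single-slab block sort. One 8-lane
+// subgroup sorts 128 keys entirely in registers, so there is no
+// local-memory merge phase after the sorting network.
+//
+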
+__kernel __attribute__((reqd_work_group_size(8, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bs_0(__global HS_KEY_TYPE const* const restrict vin,
+               __global HS_KEY_TYPE* const restrict vout)
+{
+  // no local memory: one 8-lane subgroup sorts a single 128-key slab
+  // entirely in registers
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vin + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vin + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vin + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vin + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vin + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vin + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vin + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vin + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vin + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vin + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vin + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vin + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vin + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vin + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vin + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vin + gmem_idx)[15 * 8];
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r6, r11)
+  HS_CMP_XCHG(r7, r10)
+  HS_CMP_XCHG(r4, r13)
+  HS_CMP_XCHG(r14, r15)
+  HS_CMP_XCHG(r8, r12)
+  HS_CMP_XCHG(r2, r3)
+  HS_CMP_XCHG(r5, r9)
+  HS_CMP_XCHG(r2, r5)
+  HS_CMP_XCHG(r8, r14)
+  HS_CMP_XCHG(r3, r9)
+  HS_CMP_XCHG(r12, r15)
+  HS_CMP_XCHG(r3, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r14)
+  HS_CMP_XCHG(r4, r9)
+  HS_CMP_XCHG(r8, r13)
+  HS_CMP_XCHG(r7, r9)
+  HS_CMP_XCHG(r11, r13)
+  HS_CMP_XCHG(r4, r6)
+  HS_CMP_XCHG(r8, r10)
+  HS_CMP_XCHG(r4, r5)
+  HS_CMP_XCHG(r6, r7)
+  HS_CMP_XCHG(r8, r9)
+  HS_CMP_XCHG(r10, r11)
+  HS_CMP_XCHG(r12, r13)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  {
+    uint const flip_lane_mask = 1;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 3;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  {
+    uint const flip_lane_mask = 7;
+    uint const flip_lane_idx = get_sub_group_local_id() ^ flip_lane_mask;
+    int const t_lt = get_sub_group_local_id() < flip_lane_idx;
+    HS_CMP_FLIP(0, r1, r16)
+    HS_CMP_FLIP(1, r2, r15)
+    HS_CMP_FLIP(2, r3, r14)
+    HS_CMP_FLIP(3, r4, r13)
+    HS_CMP_FLIP(4, r5, r12)
+    HS_CMP_FLIP(5, r6, r11)
+    HS_CMP_FLIP(6, r7, r10)
+    HS_CMP_FLIP(7, r8, r9)
+  }
+  {
+    uint const half_lane_mask = 2;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  {
+    uint const half_lane_mask = 1;
+    uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+    int const t_lt = get_sub_group_local_id() < half_lane_idx;
+    HS_CMP_HALF(0, r1)
+    HS_CMP_HALF(1, r2)
+    HS_CMP_HALF(2, r3)
+    HS_CMP_HALF(3, r4)
+    HS_CMP_HALF(4, r5)
+    HS_CMP_HALF(5, r6)
+    HS_CMP_HALF(6, r7)
+    HS_CMP_HALF(7, r8)
+    HS_CMP_HALF(8, r9)
+    HS_CMP_HALF(9, r10)
+    HS_CMP_HALF(10, r11)
+    HS_CMP_HALF(11, r12)
+    HS_CMP_HALF(12, r13)
+    HS_CMP_HALF(13, r14)
+    HS_CMP_HALF(14, r15)
+    HS_CMP_HALF(15, r16)
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  (vout + gmem_idx)[0 * 8] = r1;
+  (vout + gmem_idx)[1 * 8] = r2;
+  (vout + gmem_idx)[2 * 8] = r3;
+  (vout + gmem_idx)[3 * 8] = r4;
+  (vout + gmem_idx)[4 * 8] = r5;
+  (vout + gmem_idx)[5 * 8] = r6;
+  (vout + gmem_idx)[6 * 8] = r7;
+  (vout + gmem_idx)[7 * 8] = r8;
+  (vout + gmem_idx)[8 * 8] = r9;
+  (vout + gmem_idx)[9 * 8] = r10;
+  (vout + gmem_idx)[10 * 8] = r11;
+  (vout + gmem_idx)[11 * 8] = r12;
+  (vout + gmem_idx)[12 * 8] = r13;
+  (vout + gmem_idx)[13 * 8] = r14;
+  (vout + gmem_idx)[14 * 8] = r15;
+  (vout + gmem_idx)[15 * 8] = r16;
+}
+
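+//
+// hs_kernel_bc_4 (intent inferred): a "block clean" pass over 16 slabs
+// (2048 keys). Each work item gathers the key at the same offset in each of
+// 16 consecutive slabs (stride 128), merges the 16 values with a
+// compare-exchange network, stages them in local memory, and each lane then
+// finishes its own slab with the subgroup half-cleaner stages.
+//
+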
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bc_4(__global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 128];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  uint const gmem_l_idx = (global_id / 128) * 2048 + (global_id & 127);
+  uint const smem_l_idx = get_sub_group_id() * 128 + get_sub_group_local_id();
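+  // gather/merge phase (inferred): each work item reads one key per slab at
+  // a stride of 128, merges the 16 keys with a compare-exchange network, and
+  // stages the sorted run in local memory for the per-lane clean below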
+  {
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
+      HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512];
+      HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640];
+      HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768];
+      HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896];
+      HS_KEY_TYPE r0_9 = (vout + gmem_l_idx)[1024];
+      HS_KEY_TYPE r0_10 = (vout + gmem_l_idx)[1152];
+      HS_KEY_TYPE r0_11 = (vout + gmem_l_idx)[1280];
+      HS_KEY_TYPE r0_12 = (vout + gmem_l_idx)[1408];
+      HS_KEY_TYPE r0_13 = (vout + gmem_l_idx)[1536];
+      HS_KEY_TYPE r0_14 = (vout + gmem_l_idx)[1664];
+      HS_KEY_TYPE r0_15 = (vout + gmem_l_idx)[1792];
+      HS_KEY_TYPE r0_16 = (vout + gmem_l_idx)[1920];
+      HS_CMP_XCHG(r0_1, r0_9)
+      HS_CMP_XCHG(r0_5, r0_13)
+      HS_CMP_XCHG(r0_1, r0_5)
+      HS_CMP_XCHG(r0_9, r0_13)
+      HS_CMP_XCHG(r0_3, r0_11)
+      HS_CMP_XCHG(r0_7, r0_15)
+      HS_CMP_XCHG(r0_3, r0_7)
+      HS_CMP_XCHG(r0_11, r0_15)
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_5, r0_7)
+      HS_CMP_XCHG(r0_9, r0_11)
+      HS_CMP_XCHG(r0_13, r0_15)
+      HS_CMP_XCHG(r0_2, r0_10)
+      HS_CMP_XCHG(r0_6, r0_14)
+      HS_CMP_XCHG(r0_2, r0_6)
+      HS_CMP_XCHG(r0_10, r0_14)
+      HS_CMP_XCHG(r0_4, r0_12)
+      HS_CMP_XCHG(r0_8, r0_16)
+      HS_CMP_XCHG(r0_4, r0_8)
+      HS_CMP_XCHG(r0_12, r0_16)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_6, r0_8)
+      HS_CMP_XCHG(r0_10, r0_12)
+      HS_CMP_XCHG(r0_14, r0_16)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      HS_CMP_XCHG(r0_5, r0_6)
+      HS_CMP_XCHG(r0_7, r0_8)
+      HS_CMP_XCHG(r0_9, r0_10)
+      HS_CMP_XCHG(r0_11, r0_12)
+      HS_CMP_XCHG(r0_13, r0_14)
+      HS_CMP_XCHG(r0_15, r0_16)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_l_idx)[8] = r0_2;
+      (shared.m + smem_l_idx)[16] = r0_3;
+      (shared.m + smem_l_idx)[24] = r0_4;
+      (shared.m + smem_l_idx)[32] = r0_5;
+      (shared.m + smem_l_idx)[40] = r0_6;
+      (shared.m + smem_l_idx)[48] = r0_7;
+      (shared.m + smem_l_idx)[56] = r0_8;
+      (shared.m + smem_l_idx)[64] = r0_9;
+      (shared.m + smem_l_idx)[72] = r0_10;
+      (shared.m + smem_l_idx)[80] = r0_11;
+      (shared.m + smem_l_idx)[88] = r0_12;
+      (shared.m + smem_l_idx)[96] = r0_13;
+      (shared.m + smem_l_idx)[104] = r0_14;
+      (shared.m + smem_l_idx)[112] = r0_15;
+      (shared.m + smem_l_idx)[120] = r0_16;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[16 * 8 * 0];
+  HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[16 * 8 * 1];
+  HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[16 * 8 * 2];
+  HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[16 * 8 * 3];
+  HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[16 * 8 * 4];
+  HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[16 * 8 * 5];
+  HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[16 * 8 * 6];
+  HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[16 * 8 * 7];
+  HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[16 * 8 * 8];
+  HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[16 * 8 * 9];
+  HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[16 * 8 * 10];
+  HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[16 * 8 * 11];
+  HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[16 * 8 * 12];
+  HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[16 * 8 * 13];
+  HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[16 * 8 * 14];
+  HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[16 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
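+//
+// hs_kernel_bc_3 (intent inferred): the 8-slab (1024-key) block-clean
+// variant. Each work item gathers and merges two groups of 8 keys (strides
+// of 128 starting at offsets 0 and 64) before the same local-memory staging
+// and in-register clean as above.
+//
+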
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bc_3(__global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 64];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  uint const gmem_l_idx = (global_id / 64) * 1024 + (global_id & 63);
+  uint const smem_l_idx = get_sub_group_id() * 64 + get_sub_group_local_id();
+  {
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
+      HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[512];
+      HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[640];
+      HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[768];
+      HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[896];
+      HS_CMP_XCHG(r0_1, r0_5)
+      HS_CMP_XCHG(r0_3, r0_7)
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_5, r0_7)
+      HS_CMP_XCHG(r0_2, r0_6)
+      HS_CMP_XCHG(r0_4, r0_8)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_6, r0_8)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      HS_CMP_XCHG(r0_5, r0_6)
+      HS_CMP_XCHG(r0_7, r0_8)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_l_idx)[8] = r0_2;
+      (shared.m + smem_l_idx)[16] = r0_3;
+      (shared.m + smem_l_idx)[24] = r0_4;
+      (shared.m + smem_l_idx)[32] = r0_5;
+      (shared.m + smem_l_idx)[40] = r0_6;
+      (shared.m + smem_l_idx)[48] = r0_7;
+      (shared.m + smem_l_idx)[56] = r0_8;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448];
+      HS_KEY_TYPE r0_5 = (vout + gmem_l_idx)[576];
+      HS_KEY_TYPE r0_6 = (vout + gmem_l_idx)[704];
+      HS_KEY_TYPE r0_7 = (vout + gmem_l_idx)[832];
+      HS_KEY_TYPE r0_8 = (vout + gmem_l_idx)[960];
+      HS_CMP_XCHG(r0_1, r0_5)
+      HS_CMP_XCHG(r0_3, r0_7)
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_5, r0_7)
+      HS_CMP_XCHG(r0_2, r0_6)
+      HS_CMP_XCHG(r0_4, r0_8)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_6, r0_8)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      HS_CMP_XCHG(r0_5, r0_6)
+      HS_CMP_XCHG(r0_7, r0_8)
+      (shared.m + smem_l_idx)[512] = r0_1;
+      (shared.m + smem_l_idx)[520] = r0_2;
+      (shared.m + smem_l_idx)[528] = r0_3;
+      (shared.m + smem_l_idx)[536] = r0_4;
+      (shared.m + smem_l_idx)[544] = r0_5;
+      (shared.m + smem_l_idx)[552] = r0_6;
+      (shared.m + smem_l_idx)[560] = r0_7;
+      (shared.m + smem_l_idx)[568] = r0_8;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[8 * 8 * 0];
+  HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[8 * 8 * 1];
+  HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[8 * 8 * 2];
+  HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[8 * 8 * 3];
+  HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[8 * 8 * 4];
+  HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[8 * 8 * 5];
+  HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[8 * 8 * 6];
+  HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[8 * 8 * 7];
+  HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[8 * 8 * 8];
+  HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[8 * 8 * 9];
+  HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[8 * 8 * 10];
+  HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[8 * 8 * 11];
+  HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[8 * 8 * 12];
+  HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[8 * 8 * 13];
+  HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[8 * 8 * 14];
+  HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[8 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
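+//
+// hs_kernel_bc_2 is the narrower variant: four columns of four keys per
+// sub-group pass through local memory before the same per-lane clean, so it
+// presumably covers spans half the size of bc_3.
+//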
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bc_2(__global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 32];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  uint const gmem_l_idx = (global_id / 32) * 512 + (global_id & 31);
+  uint const smem_l_idx = get_sub_group_id() * 32 + get_sub_group_local_id();
+  {
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[256];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[384];
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_l_idx)[8] = r0_2;
+      (shared.m + smem_l_idx)[16] = r0_3;
+      (shared.m + smem_l_idx)[24] = r0_4;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[288];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[416];
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      (shared.m + smem_l_idx)[128] = r0_1;
+      (shared.m + smem_l_idx)[136] = r0_2;
+      (shared.m + smem_l_idx)[144] = r0_3;
+      (shared.m + smem_l_idx)[152] = r0_4;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[320];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[448];
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      (shared.m + smem_l_idx)[256] = r0_1;
+      (shared.m + smem_l_idx)[264] = r0_2;
+      (shared.m + smem_l_idx)[272] = r0_3;
+      (shared.m + smem_l_idx)[280] = r0_4;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224];
+      HS_KEY_TYPE r0_3 = (vout + gmem_l_idx)[352];
+      HS_KEY_TYPE r0_4 = (vout + gmem_l_idx)[480];
+      HS_CMP_XCHG(r0_1, r0_3)
+      HS_CMP_XCHG(r0_2, r0_4)
+      HS_CMP_XCHG(r0_1, r0_2)
+      HS_CMP_XCHG(r0_3, r0_4)
+      (shared.m + smem_l_idx)[384] = r0_1;
+      (shared.m + smem_l_idx)[392] = r0_2;
+      (shared.m + smem_l_idx)[400] = r0_3;
+      (shared.m + smem_l_idx)[408] = r0_4;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[4 * 8 * 0];
+  HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[4 * 8 * 1];
+  HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[4 * 8 * 2];
+  HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[4 * 8 * 3];
+  HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[4 * 8 * 4];
+  HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[4 * 8 * 5];
+  HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[4 * 8 * 6];
+  HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[4 * 8 * 7];
+  HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[4 * 8 * 8];
+  HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[4 * 8 * 9];
+  HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[4 * 8 * 10];
+  HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[4 * 8 * 11];
+  HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[4 * 8 * 12];
+  HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[4 * 8 * 13];
+  HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[4 * 8 * 14];
+  HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[4 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
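+//
+// hs_kernel_bc_1 halves the span again: eight pairs of keys get a single
+// compare-exchange each on the way into local memory, then the usual
+// 16-register clean runs per lane.
+//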
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bc_1(__global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+    HS_KEY_TYPE m[16 * 16];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  uint const gmem_l_idx = (global_id / 16) * 256 + (global_id & 15);
+  uint const smem_l_idx = get_sub_group_id() * 16 + get_sub_group_local_id();
+  {
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[0];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[128];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[0] = r0_1;
+      (shared.m + smem_l_idx)[8] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[16];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[144];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[32] = r0_1;
+      (shared.m + smem_l_idx)[40] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[32];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[160];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[64] = r0_1;
+      (shared.m + smem_l_idx)[72] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[48];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[176];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[96] = r0_1;
+      (shared.m + smem_l_idx)[104] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[64];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[192];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[128] = r0_1;
+      (shared.m + smem_l_idx)[136] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[80];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[208];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[160] = r0_1;
+      (shared.m + smem_l_idx)[168] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[96];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[224];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[192] = r0_1;
+      (shared.m + smem_l_idx)[200] = r0_2;
+    }
+    {
+      HS_KEY_TYPE r0_1 = (vout + gmem_l_idx)[112];
+      HS_KEY_TYPE r0_2 = (vout + gmem_l_idx)[240];
+      HS_CMP_XCHG(r0_1, r0_2)
+      (shared.m + smem_l_idx)[224] = r0_1;
+      (shared.m + smem_l_idx)[232] = r0_2;
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  HS_KEY_TYPE r1 = (shared.m + get_local_id(0))[2 * 8 * 0];
+  HS_KEY_TYPE r2 = (shared.m + get_local_id(0))[2 * 8 * 1];
+  HS_KEY_TYPE r3 = (shared.m + get_local_id(0))[2 * 8 * 2];
+  HS_KEY_TYPE r4 = (shared.m + get_local_id(0))[2 * 8 * 3];
+  HS_KEY_TYPE r5 = (shared.m + get_local_id(0))[2 * 8 * 4];
+  HS_KEY_TYPE r6 = (shared.m + get_local_id(0))[2 * 8 * 5];
+  HS_KEY_TYPE r7 = (shared.m + get_local_id(0))[2 * 8 * 6];
+  HS_KEY_TYPE r8 = (shared.m + get_local_id(0))[2 * 8 * 7];
+  HS_KEY_TYPE r9 = (shared.m + get_local_id(0))[2 * 8 * 8];
+  HS_KEY_TYPE r10 = (shared.m + get_local_id(0))[2 * 8 * 9];
+  HS_KEY_TYPE r11 = (shared.m + get_local_id(0))[2 * 8 * 10];
+  HS_KEY_TYPE r12 = (shared.m + get_local_id(0))[2 * 8 * 11];
+  HS_KEY_TYPE r13 = (shared.m + get_local_id(0))[2 * 8 * 12];
+  HS_KEY_TYPE r14 = (shared.m + get_local_id(0))[2 * 8 * 13];
+  HS_KEY_TYPE r15 = (shared.m + get_local_id(0))[2 * 8 * 14];
+  HS_KEY_TYPE r16 = (shared.m + get_local_id(0))[2 * 8 * 15];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
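+//
+// hs_kernel_bc_0 seems to clean a single slab entirely in registers; note
+// that its __local union is empty and unused, so no barrier or local-memory
+// staging happens here.
+//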
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_bc_0(__global HS_KEY_TYPE* const restrict vout)
+{
+  __local union
+  {
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_idx = (global_id / 8) * 128 + (global_id & 7);
+
+  HS_KEY_TYPE r1 = (vout + gmem_idx)[0 * 8];
+  HS_KEY_TYPE r2 = (vout + gmem_idx)[1 * 8];
+  HS_KEY_TYPE r3 = (vout + gmem_idx)[2 * 8];
+  HS_KEY_TYPE r4 = (vout + gmem_idx)[3 * 8];
+  HS_KEY_TYPE r5 = (vout + gmem_idx)[4 * 8];
+  HS_KEY_TYPE r6 = (vout + gmem_idx)[5 * 8];
+  HS_KEY_TYPE r7 = (vout + gmem_idx)[6 * 8];
+  HS_KEY_TYPE r8 = (vout + gmem_idx)[7 * 8];
+  HS_KEY_TYPE r9 = (vout + gmem_idx)[8 * 8];
+  HS_KEY_TYPE r10 = (vout + gmem_idx)[9 * 8];
+  HS_KEY_TYPE r11 = (vout + gmem_idx)[10 * 8];
+  HS_KEY_TYPE r12 = (vout + gmem_idx)[11 * 8];
+  HS_KEY_TYPE r13 = (vout + gmem_idx)[12 * 8];
+  HS_KEY_TYPE r14 = (vout + gmem_idx)[13 * 8];
+  HS_KEY_TYPE r15 = (vout + gmem_idx)[14 * 8];
+  HS_KEY_TYPE r16 = (vout + gmem_idx)[15 * 8];
+  {
+  {
+  uint const half_lane_mask = 4;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 2;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+{
+  uint const half_lane_mask = 1;
+  uint const half_lane_idx = get_sub_group_local_id() ^ half_lane_mask;
+  int const t_lt = get_sub_group_local_id() < half_lane_idx;
+  HS_CMP_HALF(0, r1)
+  HS_CMP_HALF(1, r2)
+  HS_CMP_HALF(2, r3)
+  HS_CMP_HALF(3, r4)
+  HS_CMP_HALF(4, r5)
+  HS_CMP_HALF(5, r6)
+  HS_CMP_HALF(6, r7)
+  HS_CMP_HALF(7, r8)
+  HS_CMP_HALF(8, r9)
+  HS_CMP_HALF(9, r10)
+  HS_CMP_HALF(10, r11)
+  HS_CMP_HALF(11, r12)
+  HS_CMP_HALF(12, r13)
+  HS_CMP_HALF(13, r14)
+  HS_CMP_HALF(14, r15)
+  HS_CMP_HALF(15, r16)
+}
+HS_CMP_XCHG(r1, r9)
+HS_CMP_XCHG(r5, r13)
+HS_CMP_XCHG(r1, r5)
+HS_CMP_XCHG(r9, r13)
+HS_CMP_XCHG(r3, r11)
+HS_CMP_XCHG(r7, r15)
+HS_CMP_XCHG(r3, r7)
+HS_CMP_XCHG(r11, r15)
+HS_CMP_XCHG(r1, r3)
+HS_CMP_XCHG(r5, r7)
+HS_CMP_XCHG(r9, r11)
+HS_CMP_XCHG(r13, r15)
+HS_CMP_XCHG(r2, r10)
+HS_CMP_XCHG(r6, r14)
+HS_CMP_XCHG(r2, r6)
+HS_CMP_XCHG(r10, r14)
+HS_CMP_XCHG(r4, r12)
+HS_CMP_XCHG(r8, r16)
+HS_CMP_XCHG(r4, r8)
+HS_CMP_XCHG(r12, r16)
+HS_CMP_XCHG(r2, r4)
+HS_CMP_XCHG(r6, r8)
+HS_CMP_XCHG(r10, r12)
+HS_CMP_XCHG(r14, r16)
+HS_CMP_XCHG(r1, r2)
+HS_CMP_XCHG(r3, r4)
+HS_CMP_XCHG(r5, r6)
+HS_CMP_XCHG(r7, r8)
+HS_CMP_XCHG(r9, r10)
+HS_CMP_XCHG(r11, r12)
+HS_CMP_XCHG(r13, r14)
+HS_CMP_XCHG(r15, r16)
+}
+(vout + gmem_idx)[0 * 8] = r1;
+(vout + gmem_idx)[1 * 8] = r2;
+(vout + gmem_idx)[2 * 8] = r3;
+(vout + gmem_idx)[3 * 8] = r4;
+(vout + gmem_idx)[4 * 8] = r5;
+(vout + gmem_idx)[5 * 8] = r6;
+(vout + gmem_idx)[6 * 8] = r7;
+(vout + gmem_idx)[7 * 8] = r8;
+(vout + gmem_idx)[8 * 8] = r9;
+(vout + gmem_idx)[9 * 8] = r10;
+(vout + gmem_idx)[10 * 8] = r11;
+(vout + gmem_idx)[11 * 8] = r12;
+(vout + gmem_idx)[12 * 8] = r13;
+(vout + gmem_idx)[13 * 8] = r14;
+(vout + gmem_idx)[14 * 8] = r15;
+(vout + gmem_idx)[15 * 8] = r16;
+}
+
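+//
+// The "fm" kernels below look like HotSort's flip-merge passes. In
+// hs_kernel_fm_1 the left half of a merge span is read front-to-back while
+// the right half is addressed mirror-fashion from the end
+// (merge_r_off = merge_keys - merge_l_end - 1), so the r16..r1 vs r17..r32
+// exchanges merge the two sorted halves. fm_full counts merges whose right
+// half is a full 16 rows; otherwise fm_frac (8, 4, 2 or 1) selects a
+// shortened right-hand network. The left registers are then cleaned and
+// written back through merge_l.
+//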
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_1(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 0;
+
+  uint const merge_stride = 16 * 8 << 0;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 0)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
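+//
+// hs_kernel_fm_2: the same flip-merge network as fm_1 with the merge span
+// doubled (the >> 1 / << 1 pairs in the index math).
+//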
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_2(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 1;
+
+  uint const merge_stride = 16 * 8 << 1;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 1)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
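+//
+// hs_kernel_fm_3: flip merge at four times the base span (shift by 2).
+//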
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_3(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 2;
+
+  uint const merge_stride = 16 * 8 << 2;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 2)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
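+//
+// hs_kernel_fm_4: flip merge at eight times the base span (shift by 3).
+//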
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_4(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 3;
+
+  uint const merge_stride = 16 * 8 << 3;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 3)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
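+//
+// hs_kernel_fm_5: flip merge at sixteen times the base span (shift by 4).
+//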
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_5(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 4;
+
+  uint const merge_stride = 16 * 8 << 4;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 4)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_6(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 5;
+
+  uint const merge_stride = 16 * 8 << 5;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 5)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
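+//
+// Each hs_kernel_hm_N below works in place: a subgroup lane loads 32
+// keys spaced merge_stride apart, runs the 32-key compare-exchange
+// network over them, and stores them back to the same locations. The
+// stride doubles with N, matching the fm kernels interleaved with
+// these variants.
+//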
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_5(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 0;
+
+  uint const merge_stride = 16 * 8 << 0;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 0)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_7(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 6;
+
+  uint const merge_stride = 16 * 8 << 6;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 6)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_6(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 1;
+
+  uint const merge_stride = 16 * 8 << 1;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 1)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_8(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 7;
+
+  uint const merge_stride = 16 * 8 << 7;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 7)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_7(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 2;
+
+  uint const merge_stride = 16 * 8 << 2;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 2)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_9(__global HS_KEY_TYPE* const restrict vout,
+               uint const fm_full,
+               uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 8;
+
+  uint const merge_stride = 16 * 8 << 8;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 8)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_8(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 3;
+
+  uint const merge_stride = 16 * 8 << 3;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 3)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
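+//
+// hs_kernel_fm_* (presumably "flip merge"): merge_l walks forward through the
+// lower half of the span while merge_r mirrors it from the end
+// (merge_r_off = merge_keys - merge_l_end - 1). A full merge
+// (merge_idx < fm_full) compare-exchanges all 16 left rows against 16 right
+// rows; otherwise fm_frac selects how many right rows (8, 4, 2 or 1) take
+// part. A final comparator network then cleans up and stores the 16 left
+// rows. fm_10 uses shift 9.
+//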
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_10(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 9;
+
+  uint const merge_stride = 16 * 8 << 9;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 9)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
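+// hm_9: same half-merge network as hm_8 with the merge span doubled (shift 4).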
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_9(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 4;
+
+  uint const merge_stride = 16 * 8 << 4;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 4)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
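+// fm_11: same flip-merge pattern as fm_10 with the merge span doubled (shift 10).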
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_11(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 10;
+
+  uint const merge_stride = 16 * 8 << 10;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 10)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
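+// hm_10: half-merge, merge span doubled again (shift 5).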
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_10(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 5;
+
+  uint const merge_stride = 16 * 8 << 5;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 5)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
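+// fm_12: flip-merge, shift 11.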
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_12(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 11;
+
+  uint const merge_stride = 16 * 8 << 11;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 11)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
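+// hm_11: half-merge, shift 6.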
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_11(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 6;
+
+  uint const merge_stride = 16 * 8 << 6;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 6)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
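+// fm_13: flip-merge, shift 12.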
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_13(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 12;
+
+  uint const merge_stride = 16 * 8 << 12;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 12)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
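+// hm_12: half-merge, shift 7.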
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_12(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 7;
+
+  uint const merge_stride = 16 * 8 << 7;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 7)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
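+// fm_14: flip-merge, shift 13.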
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_14(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 13;
+
+  uint const merge_stride = 16 * 8 << 13;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 13)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_13(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 8;
+
+  uint const merge_stride = 16 * 8 << 8;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 8)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_15(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 14;
+
+  uint const merge_stride = 16 * 8 << 14;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 14)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_14(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 9;
+
+  uint const merge_stride = 16 * 8 << 9;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 9)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_fm_16(__global HS_KEY_TYPE* const restrict vout,
+                uint const fm_full,
+                uint const fm_frac)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = warp_idx / 16 >> 15;
+
+  uint const merge_stride = 16 * 8 << 15;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+
+  uint const merge_l_off =
+    (warp_idx - merge_idx * (16 << 15)) * 8 + warp_lane_idx;
+  uint const merge_l_end = merge_stride * (32 / 2 - 1) + merge_l_off;
+
+  int const merge_r_off = merge_keys - merge_l_end - 1;
+
+  __global HS_KEY_TYPE* const restrict merge_l =
+    vout + (merge_base + merge_l_off);
+  __global HS_KEY_TYPE* const restrict merge_r =
+    vout + (merge_base + merge_r_off);
+
+  HS_KEY_TYPE r1 = merge_l[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_l[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_l[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_l[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_l[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_l[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_l[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_l[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_l[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_l[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_l[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_l[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_l[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_l[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_l[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_l[15 * merge_stride];
+  if (merge_idx < fm_full) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_KEY_TYPE r25 = merge_r[8 * merge_stride];
+    HS_KEY_TYPE r26 = merge_r[9 * merge_stride];
+    HS_KEY_TYPE r27 = merge_r[10 * merge_stride];
+    HS_KEY_TYPE r28 = merge_r[11 * merge_stride];
+    HS_KEY_TYPE r29 = merge_r[12 * merge_stride];
+    HS_KEY_TYPE r30 = merge_r[13 * merge_stride];
+    HS_KEY_TYPE r31 = merge_r[14 * merge_stride];
+    HS_KEY_TYPE r32 = merge_r[15 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r8, r25)
+    HS_CMP_XCHG(r7, r26)
+    HS_CMP_XCHG(r6, r27)
+    HS_CMP_XCHG(r5, r28)
+    HS_CMP_XCHG(r4, r29)
+    HS_CMP_XCHG(r3, r30)
+    HS_CMP_XCHG(r2, r31)
+    HS_CMP_XCHG(r1, r32)
+    HS_CMP_XCHG(r17, r25)
+    HS_CMP_XCHG(r21, r29)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r25, r29)
+    HS_CMP_XCHG(r19, r27)
+    HS_CMP_XCHG(r23, r31)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r27, r31)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r25, r27)
+    HS_CMP_XCHG(r29, r31)
+    HS_CMP_XCHG(r18, r26)
+    HS_CMP_XCHG(r22, r30)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r26, r30)
+    HS_CMP_XCHG(r20, r28)
+    HS_CMP_XCHG(r24, r32)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r28, r32)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r26, r28)
+    HS_CMP_XCHG(r30, r32)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    HS_CMP_XCHG(r25, r26)
+    HS_CMP_XCHG(r27, r28)
+    HS_CMP_XCHG(r29, r30)
+    HS_CMP_XCHG(r31, r32)
+    merge_r[15 * merge_stride] = r32;
+    merge_r[14 * merge_stride] = r31;
+    merge_r[13 * merge_stride] = r30;
+    merge_r[12 * merge_stride] = r29;
+    merge_r[11 * merge_stride] = r28;
+    merge_r[10 * merge_stride] = r27;
+    merge_r[9 * merge_stride] = r26;
+    merge_r[8 * merge_stride] = r25;
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 8) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_KEY_TYPE r21 = merge_r[4 * merge_stride];
+    HS_KEY_TYPE r22 = merge_r[5 * merge_stride];
+    HS_KEY_TYPE r23 = merge_r[6 * merge_stride];
+    HS_KEY_TYPE r24 = merge_r[7 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r12, r21)
+    HS_CMP_XCHG(r11, r22)
+    HS_CMP_XCHG(r10, r23)
+    HS_CMP_XCHG(r9, r24)
+    HS_CMP_XCHG(r17, r21)
+    HS_CMP_XCHG(r19, r23)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r21, r23)
+    HS_CMP_XCHG(r18, r22)
+    HS_CMP_XCHG(r20, r24)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r22, r24)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    HS_CMP_XCHG(r21, r22)
+    HS_CMP_XCHG(r23, r24)
+    merge_r[7 * merge_stride] = r24;
+    merge_r[6 * merge_stride] = r23;
+    merge_r[5 * merge_stride] = r22;
+    merge_r[4 * merge_stride] = r21;
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 4) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_KEY_TYPE r19 = merge_r[2 * merge_stride];
+    HS_KEY_TYPE r20 = merge_r[3 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r14, r19)
+    HS_CMP_XCHG(r13, r20)
+    HS_CMP_XCHG(r17, r19)
+    HS_CMP_XCHG(r18, r20)
+    HS_CMP_XCHG(r17, r18)
+    HS_CMP_XCHG(r19, r20)
+    merge_r[3 * merge_stride] = r20;
+    merge_r[2 * merge_stride] = r19;
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else if (fm_frac == 2) {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_KEY_TYPE r18 = merge_r[1 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    HS_CMP_XCHG(r15, r18)
+    HS_CMP_XCHG(r17, r18)
+    merge_r[1 * merge_stride] = r18;
+    merge_r[0 * merge_stride] = r17;
+  } else {
+    HS_KEY_TYPE r17 = merge_r[0 * merge_stride];
+    HS_CMP_XCHG(r16, r17)
+    merge_r[0 * merge_stride] = r17;
+  }
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  merge_l[15 * merge_stride] = r16;
+  merge_l[14 * merge_stride] = r15;
+  merge_l[13 * merge_stride] = r14;
+  merge_l[12 * merge_stride] = r13;
+  merge_l[11 * merge_stride] = r12;
+  merge_l[10 * merge_stride] = r11;
+  merge_l[9 * merge_stride] = r10;
+  merge_l[8 * merge_stride] = r9;
+  merge_l[7 * merge_stride] = r8;
+  merge_l[6 * merge_stride] = r7;
+  merge_l[5 * merge_stride] = r6;
+  merge_l[4 * merge_stride] = r5;
+  merge_l[3 * merge_stride] = r4;
+  merge_l[2 * merge_stride] = r3;
+  merge_l[1 * merge_stride] = r2;
+  merge_l[0 * merge_stride] = r1;
+}
+
+__kernel __attribute__((intel_reqd_sub_group_size(8))) void
+hs_kernel_hm_15(__global HS_KEY_TYPE* const restrict vout)
+{
+  uint const global_id = (uint)get_global_id(0);
+  uint const warp_idx = global_id / 8;
+  uint const warp_lane_idx = global_id & 7;
+
+  uint const merge_idx = (warp_idx / 16) >> 10;
+
+  uint const merge_stride = 16 * 8 << 10;
+  uint const merge_keys = merge_stride * 32;
+
+  uint const merge_base = merge_idx * merge_keys;
+  uint const merge_off = (warp_idx - merge_idx * (16 << 10)) * 8;
+
+  __global HS_KEY_TYPE* const restrict merge_ptr =
+    vout + (merge_base + merge_off + warp_lane_idx);
+
+  HS_KEY_TYPE r1 = merge_ptr[0 * merge_stride];
+  HS_KEY_TYPE r2 = merge_ptr[1 * merge_stride];
+  HS_KEY_TYPE r3 = merge_ptr[2 * merge_stride];
+  HS_KEY_TYPE r4 = merge_ptr[3 * merge_stride];
+  HS_KEY_TYPE r5 = merge_ptr[4 * merge_stride];
+  HS_KEY_TYPE r6 = merge_ptr[5 * merge_stride];
+  HS_KEY_TYPE r7 = merge_ptr[6 * merge_stride];
+  HS_KEY_TYPE r8 = merge_ptr[7 * merge_stride];
+  HS_KEY_TYPE r9 = merge_ptr[8 * merge_stride];
+  HS_KEY_TYPE r10 = merge_ptr[9 * merge_stride];
+  HS_KEY_TYPE r11 = merge_ptr[10 * merge_stride];
+  HS_KEY_TYPE r12 = merge_ptr[11 * merge_stride];
+  HS_KEY_TYPE r13 = merge_ptr[12 * merge_stride];
+  HS_KEY_TYPE r14 = merge_ptr[13 * merge_stride];
+  HS_KEY_TYPE r15 = merge_ptr[14 * merge_stride];
+  HS_KEY_TYPE r16 = merge_ptr[15 * merge_stride];
+  HS_KEY_TYPE r17 = merge_ptr[16 * merge_stride];
+  HS_KEY_TYPE r18 = merge_ptr[17 * merge_stride];
+  HS_KEY_TYPE r19 = merge_ptr[18 * merge_stride];
+  HS_KEY_TYPE r20 = merge_ptr[19 * merge_stride];
+  HS_KEY_TYPE r21 = merge_ptr[20 * merge_stride];
+  HS_KEY_TYPE r22 = merge_ptr[21 * merge_stride];
+  HS_KEY_TYPE r23 = merge_ptr[22 * merge_stride];
+  HS_KEY_TYPE r24 = merge_ptr[23 * merge_stride];
+  HS_KEY_TYPE r25 = merge_ptr[24 * merge_stride];
+  HS_KEY_TYPE r26 = merge_ptr[25 * merge_stride];
+  HS_KEY_TYPE r27 = merge_ptr[26 * merge_stride];
+  HS_KEY_TYPE r28 = merge_ptr[27 * merge_stride];
+  HS_KEY_TYPE r29 = merge_ptr[28 * merge_stride];
+  HS_KEY_TYPE r30 = merge_ptr[29 * merge_stride];
+  HS_KEY_TYPE r31 = merge_ptr[30 * merge_stride];
+  HS_KEY_TYPE r32 = merge_ptr[31 * merge_stride];
+  HS_CMP_XCHG(r1, r17)
+  HS_CMP_XCHG(r9, r25)
+  HS_CMP_XCHG(r1, r9)
+  HS_CMP_XCHG(r17, r25)
+  HS_CMP_XCHG(r5, r21)
+  HS_CMP_XCHG(r13, r29)
+  HS_CMP_XCHG(r5, r13)
+  HS_CMP_XCHG(r21, r29)
+  HS_CMP_XCHG(r1, r5)
+  HS_CMP_XCHG(r9, r13)
+  HS_CMP_XCHG(r17, r21)
+  HS_CMP_XCHG(r25, r29)
+  HS_CMP_XCHG(r3, r19)
+  HS_CMP_XCHG(r11, r27)
+  HS_CMP_XCHG(r3, r11)
+  HS_CMP_XCHG(r19, r27)
+  HS_CMP_XCHG(r7, r23)
+  HS_CMP_XCHG(r15, r31)
+  HS_CMP_XCHG(r7, r15)
+  HS_CMP_XCHG(r23, r31)
+  HS_CMP_XCHG(r3, r7)
+  HS_CMP_XCHG(r11, r15)
+  HS_CMP_XCHG(r19, r23)
+  HS_CMP_XCHG(r27, r31)
+  HS_CMP_XCHG(r1, r3)
+  HS_CMP_XCHG(r5, r7)
+  HS_CMP_XCHG(r9, r11)
+  HS_CMP_XCHG(r13, r15)
+  HS_CMP_XCHG(r17, r19)
+  HS_CMP_XCHG(r21, r23)
+  HS_CMP_XCHG(r25, r27)
+  HS_CMP_XCHG(r29, r31)
+  HS_CMP_XCHG(r2, r18)
+  HS_CMP_XCHG(r10, r26)
+  HS_CMP_XCHG(r2, r10)
+  HS_CMP_XCHG(r18, r26)
+  HS_CMP_XCHG(r6, r22)
+  HS_CMP_XCHG(r14, r30)
+  HS_CMP_XCHG(r6, r14)
+  HS_CMP_XCHG(r22, r30)
+  HS_CMP_XCHG(r2, r6)
+  HS_CMP_XCHG(r10, r14)
+  HS_CMP_XCHG(r18, r22)
+  HS_CMP_XCHG(r26, r30)
+  HS_CMP_XCHG(r4, r20)
+  HS_CMP_XCHG(r12, r28)
+  HS_CMP_XCHG(r4, r12)
+  HS_CMP_XCHG(r20, r28)
+  HS_CMP_XCHG(r8, r24)
+  HS_CMP_XCHG(r16, r32)
+  HS_CMP_XCHG(r8, r16)
+  HS_CMP_XCHG(r24, r32)
+  HS_CMP_XCHG(r4, r8)
+  HS_CMP_XCHG(r12, r16)
+  HS_CMP_XCHG(r20, r24)
+  HS_CMP_XCHG(r28, r32)
+  HS_CMP_XCHG(r2, r4)
+  HS_CMP_XCHG(r6, r8)
+  HS_CMP_XCHG(r10, r12)
+  HS_CMP_XCHG(r14, r16)
+  HS_CMP_XCHG(r18, r20)
+  HS_CMP_XCHG(r22, r24)
+  HS_CMP_XCHG(r26, r28)
+  HS_CMP_XCHG(r30, r32)
+  HS_CMP_XCHG(r1, r2)
+  HS_CMP_XCHG(r3, r4)
+  HS_CMP_XCHG(r5, r6)
+  HS_CMP_XCHG(r7, r8)
+  HS_CMP_XCHG(r9, r10)
+  HS_CMP_XCHG(r11, r12)
+  HS_CMP_XCHG(r13, r14)
+  HS_CMP_XCHG(r15, r16)
+  HS_CMP_XCHG(r17, r18)
+  HS_CMP_XCHG(r19, r20)
+  HS_CMP_XCHG(r21, r22)
+  HS_CMP_XCHG(r23, r24)
+  HS_CMP_XCHG(r25, r26)
+  HS_CMP_XCHG(r27, r28)
+  HS_CMP_XCHG(r29, r30)
+  HS_CMP_XCHG(r31, r32)
+  merge_ptr[31 * merge_stride] = r32;
+  merge_ptr[30 * merge_stride] = r31;
+  merge_ptr[29 * merge_stride] = r30;
+  merge_ptr[28 * merge_stride] = r29;
+  merge_ptr[27 * merge_stride] = r28;
+  merge_ptr[26 * merge_stride] = r27;
+  merge_ptr[25 * merge_stride] = r26;
+  merge_ptr[24 * merge_stride] = r25;
+  merge_ptr[23 * merge_stride] = r24;
+  merge_ptr[22 * merge_stride] = r23;
+  merge_ptr[21 * merge_stride] = r22;
+  merge_ptr[20 * merge_stride] = r21;
+  merge_ptr[19 * merge_stride] = r20;
+  merge_ptr[18 * merge_stride] = r19;
+  merge_ptr[17 * merge_stride] = r18;
+  merge_ptr[16 * merge_stride] = r17;
+  merge_ptr[15 * merge_stride] = r16;
+  merge_ptr[14 * merge_stride] = r15;
+  merge_ptr[13 * merge_stride] = r14;
+  merge_ptr[12 * merge_stride] = r13;
+  merge_ptr[11 * merge_stride] = r12;
+  merge_ptr[10 * merge_stride] = r11;
+  merge_ptr[9 * merge_stride] = r10;
+  merge_ptr[8 * merge_stride] = r9;
+  merge_ptr[7 * merge_stride] = r8;
+  merge_ptr[6 * merge_stride] = r7;
+  merge_ptr[5 * merge_stride] = r6;
+  merge_ptr[4 * merge_stride] = r5;
+  merge_ptr[3 * merge_stride] = r4;
+  merge_ptr[2 * merge_stride] = r3;
+  merge_ptr[1 * merge_stride] = r2;
+  merge_ptr[0 * merge_stride] = r1;
+}
+
+//
+//
+//
diff --git a/src/compute/hs/cl/gen9/hs_cl.h b/src/compute/hs/cl/gen9/hs_cl.h
new file mode 100644
index 0000000..a33b2b7
--- /dev/null
+++ b/src/compute/hs/cl/gen9/hs_cl.h
@@ -0,0 +1,122 @@
+//                                                            
+// Copyright 2016 Google Inc.                                 
+//                                                            
+// Use of this source code is governed by a BSD-style         
+// license that can be found in the LICENSE file.             
+//                                                            
+                                                              
+#ifndef HS_CL_ONCE                                            
+#define HS_CL_ONCE                                            
+                                                              
+#define HS_LANES_PER_WARP_LOG2  3                            
+#define HS_LANES_PER_WARP       (1 << HS_LANES_PER_WARP_LOG2) 
+#define HS_BS_WARPS             16                            
+#define HS_BS_WARPS_LOG2_RU     4                            
+#define HS_BC_WARPS_LOG2_MAX    4                            
+#define HS_FM_BLOCKS_LOG2_MIN   1                            
+#define HS_HM_BLOCKS_LOG2_MIN   1                            
+#define HS_KEYS_PER_LANE        16                            
+#define HS_REG_LAST(c)          c##16                         
+#define HS_KEY_WORDS            2                            
+#define HS_KEY_TYPE             ulong                            
+#define HS_EMPTY                                              
+                                                              
+#define HS_SLAB_ROWS()    \
+  HS_SLAB_ROW(   1,   0 ) \
+  HS_SLAB_ROW(   2,   1 ) \
+  HS_SLAB_ROW(   3,   2 ) \
+  HS_SLAB_ROW(   4,   3 ) \
+  HS_SLAB_ROW(   5,   4 ) \
+  HS_SLAB_ROW(   6,   5 ) \
+  HS_SLAB_ROW(   7,   6 ) \
+  HS_SLAB_ROW(   8,   7 ) \
+  HS_SLAB_ROW(   9,   8 ) \
+  HS_SLAB_ROW(  10,   9 ) \
+  HS_SLAB_ROW(  11,  10 ) \
+  HS_SLAB_ROW(  12,  11 ) \
+  HS_SLAB_ROW(  13,  12 ) \
+  HS_SLAB_ROW(  14,  13 ) \
+  HS_SLAB_ROW(  15,  14 ) \
+  HS_SLAB_ROW(  16,  15 ) \
+  HS_EMPTY
+          
+#define HS_TRANSPOSE_SLAB()                \
+  HS_TRANSPOSE_STAGE( 1 )                  \
+  HS_TRANSPOSE_STAGE( 2 )                  \
+  HS_TRANSPOSE_STAGE( 3 )                  \
+  HS_TRANSPOSE_BLEND( r, s,  1,   2,   1 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   4,   3 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   6,   5 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,   8,   7 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  10,   9 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  12,  11 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  14,  13 ) \
+  HS_TRANSPOSE_BLEND( r, s,  1,  16,  15 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   3,   1 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   4,   2 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   7,   5 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,   8,   6 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  11,   9 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  12,  10 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  15,  13 ) \
+  HS_TRANSPOSE_BLEND( s, t,  2,  16,  14 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   5,   1 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   6,   2 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   7,   3 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,   8,   4 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  13,   9 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  14,  10 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  15,  11 ) \
+  HS_TRANSPOSE_BLEND( t, u,  3,  16,  12 ) \
+  HS_TRANSPOSE_REMAP( u,   1,   1 )        \
+  HS_TRANSPOSE_REMAP( u,   2,   3 )        \
+  HS_TRANSPOSE_REMAP( u,   3,   5 )        \
+  HS_TRANSPOSE_REMAP( u,   4,   7 )        \
+  HS_TRANSPOSE_REMAP( u,   5,   9 )        \
+  HS_TRANSPOSE_REMAP( u,   6,  11 )        \
+  HS_TRANSPOSE_REMAP( u,   7,  13 )        \
+  HS_TRANSPOSE_REMAP( u,   8,  15 )        \
+  HS_TRANSPOSE_REMAP( u,   9,   2 )        \
+  HS_TRANSPOSE_REMAP( u,  10,   4 )        \
+  HS_TRANSPOSE_REMAP( u,  11,   6 )        \
+  HS_TRANSPOSE_REMAP( u,  12,   8 )        \
+  HS_TRANSPOSE_REMAP( u,  13,  10 )        \
+  HS_TRANSPOSE_REMAP( u,  14,  12 )        \
+  HS_TRANSPOSE_REMAP( u,  15,  14 )        \
+  HS_TRANSPOSE_REMAP( u,  16,  16 )        \
+  HS_EMPTY
+          
+#define HS_FM_BLOCKS_LOG2_1    0 
+#define HS_FM_BLOCKS_LOG2_2    1 
+#define HS_FM_BLOCKS_LOG2_3    2 
+#define HS_FM_BLOCKS_LOG2_4    3 
+#define HS_FM_BLOCKS_LOG2_5    4 
+#define HS_FM_BLOCKS_LOG2_6    5 
+#define HS_HM_BLOCKS_LOG2_5    0 
+#define HS_FM_BLOCKS_LOG2_7    6 
+#define HS_HM_BLOCKS_LOG2_6    1 
+#define HS_FM_BLOCKS_LOG2_8    7 
+#define HS_HM_BLOCKS_LOG2_7    2 
+#define HS_FM_BLOCKS_LOG2_9    8 
+#define HS_HM_BLOCKS_LOG2_8    3 
+#define HS_FM_BLOCKS_LOG2_10   9 
+#define HS_HM_BLOCKS_LOG2_9    4 
+#define HS_FM_BLOCKS_LOG2_11   10 
+#define HS_HM_BLOCKS_LOG2_10   5 
+#define HS_FM_BLOCKS_LOG2_12   11 
+#define HS_HM_BLOCKS_LOG2_11   6 
+#define HS_FM_BLOCKS_LOG2_13   12 
+#define HS_HM_BLOCKS_LOG2_12   7 
+#define HS_FM_BLOCKS_LOG2_14   13 
+#define HS_HM_BLOCKS_LOG2_13   8 
+#define HS_FM_BLOCKS_LOG2_15   14 
+#define HS_HM_BLOCKS_LOG2_14   9 
+#define HS_FM_BLOCKS_LOG2_16   15 
+#define HS_HM_BLOCKS_LOG2_15   10 
+                                
+#endif                          
+                                
+//                              
+//                              
+//                              
+                                
diff --git a/src/compute/hs/cl/gen9/hs_cl_macros.h b/src/compute/hs/cl/gen9/hs_cl_macros.h
new file mode 100644
index 0000000..d314fe8
--- /dev/null
+++ b/src/compute/hs/cl/gen9/hs_cl_macros.h
@@ -0,0 +1,199 @@
+//
+// Copyright 2016 Google Inc.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+#ifndef HS_CL_MACROS_ONCE
+#define HS_CL_MACROS_ONCE
+
+//
+//
+//
+
+#include "hs_cl.h"
+
+//
+// Inter-lane compare exchange
+//
+
+// default
+#define HS_CMP_XCHG_V0(a,b)                     \
+  {                                             \
+    HS_KEY_TYPE const t = min(a,b);             \
+    b = max(a,b);                               \
+    a = t;                                      \
+  }
+
+// super slow
+#define HS_CMP_XCHG_V1(a,b)                     \
+  {                                             \
+    HS_KEY_TYPE const tmp = a;                  \
+    a  = (a < b) ? a : b;                       \
+    b ^= a ^ tmp;                               \
+  }
+
+// best
+#define HS_CMP_XCHG_V2(a,b)                     \
+  if (a >= b) {                                 \
+    HS_KEY_TYPE const t = a;                    \
+    a = b;                                      \
+    b = t;                                      \
+  }
+
+// good
+#define HS_CMP_XCHG_V3(a,b)                     \
+  {                                             \
+    int         const ge = a >= b;              \
+    HS_KEY_TYPE const t  = a;                   \
+    a = ge ? b : a;                             \
+    b = ge ? t : b;                             \
+  }
+
+//
+//
+//
+
+#if   (HS_KEY_WORDS == 1)
+#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V2(a,b)
+#endif
+
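A minimal host-side sketch (not part of the patch; it assumes 64-bit keys and mirrors HS_CMP_XCHG_V2 above) of how the generated kernels use this primitive: applying the conditional swap in a fixed pattern over registers forms a small sorting network.

  // Host-side sketch of the compare-exchange primitive for 64-bit keys.
  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t hs_key_t;

  #define CMP_XCHG(a,b)        \
    if ((a) >= (b)) {          \
      hs_key_t const t = (a);  \
      (a) = (b);               \
      (b) = t;                 \
    }

  int main(void)
  {
    hs_key_t r1 = 7, r2 = 3, r3 = 9, r4 = 1;

    // a 4-key sorting network expressed as compare-exchanges,
    // analogous to the fixed networks in the generated kernels
    CMP_XCHG(r1, r2);
    CMP_XCHG(r3, r4);
    CMP_XCHG(r1, r3);
    CMP_XCHG(r2, r4);
    CMP_XCHG(r2, r3);

    printf("%llu %llu %llu %llu\n",
           (unsigned long long)r1, (unsigned long long)r2,
           (unsigned long long)r3, (unsigned long long)r4); // 1 3 7 9
    return 0;
  }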
+//
+// Conditional inter-subgroup flip/half compare exchange
+//
+
+#define HS_CMP_FLIP(i,a,b)                                              \
+  {                                                                     \
+    HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx);    \
+    HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx);    \
+    a = HS_COND_MIN_MAX(t_lt,a,tb);                                     \
+    b = HS_COND_MIN_MAX(t_lt,b,ta);                                     \
+  }
+
+#define HS_CMP_HALF(i,a)                                                \
+  {                                                                     \
+    HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx);    \
+    a = HS_COND_MIN_MAX(t_lt,a,ta);                                     \
+  }
+
+//
+// The device's comparison operator may already return the mask we
+// want.  For example, GEN's 'cmp' appears to return {true:-1,false:0}.
+//
+
+#define HS_CMP_IS_ZERO_ONE
+
+#ifdef HS_CMP_IS_ZERO_ONE
+// OpenCL requires a {true: +1, false: 0} scalar result
+// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
+#define HS_CMP_TO_MASK(a)   (HS_KEY_TYPE)(-a)
+#else
+// However, OpenCL requires { -1, 0 } for vectors
+// (a < b) -> { 0xFFFFFFFF, 0 }
+#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
+#define HS_CMP_TO_MASK(a)   (a)
+#endif
+
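A quick host-side check of the negation trick (illustrative only; the names below are not from the patch): for scalar operands the comparison yields 0 or 1, and negating it in the key's unsigned type produces an all-zeros or all-ones mask.

  // Sketch: -(a <= b) as an all-zeros / all-ones mask for 64-bit keys.
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
    uint64_t const a = 3, b = 5;

    uint64_t const mask_true  = (uint64_t)(-(a <= b)); // 0xFFFFFFFFFFFFFFFF
    uint64_t const mask_false = (uint64_t)(-(b <= a)); // 0x0000000000000000

    printf("%016llx %016llx\n",
           (unsigned long long)mask_true,
           (unsigned long long)mask_false);
    return 0;
  }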
+//
+// The flip/half comparisons rely on a "conditional min/max":
+//
+//  - if the flag is true, return min(a,b)
+//  - otherwise, return max(a,b)
+//
+// What's a little surprising is that the mask-based sequence (V1) is
+// faster than the select-based sequence (V0) for 32-bit keys.
+//
+// I suspect either a code generation quirk or that the sequence simply
+// maps well onto the GEN instruction set.
+//
+// We mostly care about 64-bit keys and, unsurprisingly, the
+// select-based sequence (V0) is fastest for this wider type.
+//
+
+// this is what you would normally use
+#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
+
+// this seems to be faster for 32-bit keys
+#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
+
+//
+//
+//
+
+#if   (HS_KEY_WORDS == 1)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
+#elif (HS_KEY_WORDS == 2)
+#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
+#endif
+
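The sketch below (illustrative only, with locally defined stand-ins for the macros above) exercises both sequences with 64-bit keys; as the definitions imply, a true flag selects the minimum, a false flag the maximum, and V0 and V1 agree.

  // Sketch: both conditional min/max sequences for 64-bit keys.
  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t hs_key_t;

  #define LTE_TO_MASK(a,b)        ((hs_key_t)(-((a) <= (b))))
  #define COND_MIN_MAX_V0(lt,a,b) ((((a) <= (b)) ^ (lt)) ? (b) : (a))
  #define COND_MIN_MAX_V1(lt,a,b) (((lt) ? (b) : (a)) ^ (((a) ^ (b)) & LTE_TO_MASK(a,b)))

  int main(void)
  {
    hs_key_t const a = 42, b = 7;

    printf("V0: lt=1 -> %llu, lt=0 -> %llu\n",
           (unsigned long long)COND_MIN_MAX_V0(1, a, b),   // 7  (min)
           (unsigned long long)COND_MIN_MAX_V0(0, a, b));  // 42 (max)
    printf("V1: lt=1 -> %llu, lt=0 -> %llu\n",
           (unsigned long long)COND_MIN_MAX_V1(1, a, b),   // 7  (min)
           (unsigned long long)COND_MIN_MAX_V1(0, a, b));  // 42 (max)
    return 0;
  }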
+//
+// This snarl of macros is for transposing a "slab" of sorted elements
+// into linear order.
+//
+// This can occur as the last step in hs_sort() or via a custom kernel
+// that inspects the slab and then transposes and stores it to memory.
+//
+// The slab format can be inspected more efficiently than a linear
+// arrangement.
+//
+// The prime example is detecting when adjacent keys (in sort order)
+// have differing high order bits ("key changes").  The index of each
+// change is recorded in an auxiliary array.
+//
+// A post-processing step like this needs to be able to navigate the
+// slab and eventually transpose and store the slab in linear order.
+//
+
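A loose host-side sketch of the "key change" example above (the 32/32 split of the key is an assumption for illustration and is not taken from the patch):

  // Sketch: record the index of every key whose high 32 bits differ
  // from the previous key's -- the kind of post-processing the slab
  // format is meant to make cheap before the final transpose.
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
    uint64_t const keys[] = {          // assumed already sorted
      0x0000000100000001ull, 0x0000000100000007ull,
      0x0000000200000003ull, 0x0000000200000009ull,
      0x0000000300000000ull
    };
    uint32_t changes[8];
    uint32_t n = 0;

    for (uint32_t ii = 1; ii < sizeof(keys) / sizeof(keys[0]); ii++) {
      if ((keys[ii] >> 32) != (keys[ii - 1] >> 32))
        changes[n++] = ii; // auxiliary array of key-change indices
    }

    for (uint32_t ii = 0; ii < n; ii++)
      printf("key change at %u\n", changes[ii]);

    return 0;
  }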
+#define HS_TRANSPOSE_REG(prefix,row)   prefix##row
+#define HS_TRANSPOSE_DECL(prefix,row)  HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
+
+#define HS_TRANSPOSE_DELTA(level)     (HS_LANES_PER_WARP + (1 << (level-1)))
+#define HS_TRANSPOSE_IF(level)        ((get_sub_group_local_id() >> (level - 1)) & 1)
+
+#define HS_TRANSPOSE_LL(level)        HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level)
+#define HS_TRANSPOSE_UR(level)        HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0
+
+#define HS_TRANSPOSE_DELTA_LL(level)  delta_ll_##level
+#define HS_TRANSPOSE_DELTA_UR(level)  delta_ur_##level
+
+#define HS_TRANSPOSE_STAGE(level)                                       \
+  uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level);     \
+  uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);
+
+#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
+    intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll),  \
+                                 HS_TRANSPOSE_REG(prefix_prev,row_ur),  \
+                                 HS_TRANSPOSE_DELTA_LL(level));         \
+  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
+    intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll),    \
+                               HS_TRANSPOSE_REG(prefix_prev,row_ur),    \
+                               HS_TRANSPOSE_DELTA_UR(level));           \
+
+// #define HS_TRANSPOSE_LOAD(row)                                        \
+//   HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];
+
+#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)                      \
+  (vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] =             \
+    HS_TRANSPOSE_REG(prefix,row_from);
+
+//
+// undefine these if you want to override
+//
+
+#define HS_TRANSPOSE_PREAMBLE()
+#define HS_TRANSPOSE_BODY()
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/skc/context.c b/src/compute/skc/context.c
index 8aac2ef..59c7956 100644
--- a/src/compute/skc/context.c
+++ b/src/compute/skc/context.c
@@ -28,23 +28,20 @@
 //
 
 skc_err
-skc_context_create(skc_context_t       * context,
-                   char          const * target_platform_substring,
-                   char          const * target_device_substring,
-                   intptr_t              context_properties[])
+skc_context_create_cl(skc_context_t * context,
+                      cl_context      context_cl,
+                      cl_device_id    device_id_cl)
 {
   (*context) = malloc(sizeof(**context));
 
   //
-  // FIXME -- don't directly grab a CL runtime but for now juts create
-  // the CL_12 runtime here
+  // FIXME -- we'll clean up context creation by platform later.  For
+  // now, just create a CL_12 context.
   //
   skc_err err;
 
-  err = skc_runtime_cl_12_create(*context,
-                                 target_platform_substring,
-                                 target_device_substring,
-                                 context_properties);
+  err = skc_runtime_cl_12_create(*context,context_cl,device_id_cl);
+
   return err;
 }
 
diff --git a/src/compute/skc/main.c b/src/compute/skc/main.c
index 8261f4b..e0d42b3 100644
--- a/src/compute/skc/main.c
+++ b/src/compute/skc/main.c
@@ -21,6 +21,11 @@
 #include <stdlib.h>
 #include <conio.h>
 
+#include "skc_create_cl.h"
+
+#include "common/cl/find_cl.h"
+#include "common/cl/assert_cl.h"
+
 #include "svg/svg_doc.h"
 #include "svg2skc/svg2skc.h"
 #include "svg2skc/transform_stack.h"
@@ -49,7 +54,7 @@
 //
 //
 
-static 
+static
 void
 is_render_complete(skc_surface_t     surface,
                    skc_styling_t     styling,
@@ -67,9 +72,9 @@
 main(int argc, char** argv)
 {
   //
-  // 
   //
-  if (argc <= 1) 
+  //
+  if (argc <= 1)
     {
       fprintf(stderr,"-- missing filename\n");
       return EXIT_FAILURE; // no filename
@@ -95,28 +100,49 @@
   skc_interop_init(&window);
 
   //
+  // find platform and device by name
+  //
+  cl_platform_id platform_id_cl;
+  cl_device_id   device_id_cl;
+
+  cl(FindIdsByName("Intel","Graphics",
+                   &platform_id_cl,
+                   &device_id_cl,
+                   0,NULL,NULL,
+                   true));
+
+  //
   // get GL and device contexts
   //
   HGLRC hGLRC = wglGetCurrentContext();
   HDC   hDC   = wglGetCurrentDC();
 
   //
+  // create the CL context
   //
-  //
-  cl_context_properties context_properties[] =
+  cl_context_properties context_properties_cl[] =
     {
-      CL_CONTEXT_PLATFORM, (cl_context_properties)-1,
+      CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
       CL_GL_CONTEXT_KHR,   (cl_context_properties)hGLRC,
       CL_WGL_HDC_KHR,      (cl_context_properties)hDC,
       0
     };
-  
+
+  cl_int     cl_err;
+  cl_context context_cl = clCreateContext(context_properties_cl,
+                                          1,
+                                          &device_id_cl,
+                                          NULL,
+                                          NULL,
+                                          &cl_err); cl_ok(cl_err);
   //
-  // create context
+  // create SKC context
   //
   skc_context_t context;
 
-  skc_err err = skc_context_create(&context,"Intel","Graphics",context_properties);
+  skc_err err = skc_context_create_cl(&context,
+                                      context_cl,
+                                      device_id_cl);
 
   //
   // associate
@@ -136,14 +162,14 @@
   skc_raster_builder_t raster_builder;
 
   err = skc_raster_builder_create(context,&raster_builder);
-  
+
   //
   // create a composition
   //
   skc_composition_t composition;
 
   err = skc_composition_create(context,&composition);
-  
+
   //
   // create a styling instance
   //
@@ -154,7 +180,7 @@
                            svg_doc_layer_count(svg_doc),
                            1000,
                            2 * 1024 * 1024);
-  
+
   //
   // create a surface
   //
@@ -191,7 +217,7 @@
       skc_transform_stack_restore(ts,ts_save);
 
       // decode layers -- places rasters
-      svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);    
+      svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
 
       // seal the composition
       skc_composition_seal(composition);
@@ -244,7 +270,7 @@
       // unseal the composition
       skc_composition_unseal(composition,true);
     }
-  
+
   //
   // dispose of mundane resources
   //
diff --git a/src/compute/skc/platforms/cl_12/allocator_device_cl.c b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
index aa44f36..90ae26e 100644
--- a/src/compute/skc/platforms/cl_12/allocator_device_cl.c
+++ b/src/compute/skc/platforms/cl_12/allocator_device_cl.c
@@ -106,7 +106,7 @@
                           &runtime->allocator.device.temp.suballocator,
                           "DEVICE",
                           runtime->config->suballocator.device.subbufs,
-                          runtime->cl.base_align,
+                          runtime->cl.align_bytes,
                           runtime->config->suballocator.device.size);
 
 #ifndef NDEBUG
diff --git a/src/compute/skc/platforms/cl_12/config_cl.h b/src/compute/skc/platforms/cl_12/config_cl.h
index 0172857..ac5cd76 100644
--- a/src/compute/skc/platforms/cl_12/config_cl.h
+++ b/src/compute/skc/platforms/cl_12/config_cl.h
@@ -12,7 +12,6 @@
 //
 //
 
-#include "runtime_cl.h"
 #include "block_pool_cl.h"
 
 //
@@ -52,8 +51,8 @@
   union skc_block_pool_size  block_pool;
 
   struct {
-    skc_cq_type_e            type;
-    skc_uint                 size;
+    cl_command_queue_properties cq_props;
+    skc_uint                    size;
   } cq_pool;
 
   struct {
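With the pool config now holding raw cl_command_queue_properties, a target can request any standard queue flags directly. An illustrative initializer, not taken from any shipped config:

  // Illustrative values only: profiling in debug builds, plus an
  // out-of-order queue where the target benefits from one.
  #ifndef NDEBUG
  #define EXAMPLE_CQ_PROPS (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE)
  #else
  #define EXAMPLE_CQ_PROPS CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
  #endif

  .cq_pool = {
    .cq_props = EXAMPLE_CQ_PROPS,
    .size     = 8
  },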
diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.c b/src/compute/skc/platforms/cl_12/cq_pool_cl.c
index 80cfe34..8d1537d 100644
--- a/src/compute/skc/platforms/cl_12/cq_pool_cl.c
+++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.c
@@ -7,17 +7,18 @@
  */
 
 //
-//
+// squelch OpenCL 1.2 deprecation warning
 //
 
-#ifndef NDEBUG
-#include <stdio.h>
+#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 #endif
 
 //
 //
 //
 
+#include <stdio.h>
 #include <string.h>
 
 //
@@ -25,6 +26,7 @@
 //
 
 #include "runtime_cl_12.h"
+#include "common/cl/assert_cl.h"
 
 //
 // This implementation is probably excessive.
@@ -40,21 +42,77 @@
 //
 //
 
-void
-skc_cq_pool_create(struct skc_runtime * const runtime,
-                   struct skc_cq_pool * const pool,
-                   skc_uint             const type,
-                   skc_uint             const size)
-{
-  pool->type   = type;
-  pool->size   = size + 1; // an empty spot
-  pool->reads  = 0;
-  pool->writes = size;
-  pool->cq     = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
+static
+cl_command_queue
+skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime,
+                            struct skc_cq_pool * const pool)
 
-  for (skc_uint ii=0; ii<size; ii++) {
-    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
-  }
+{
+  cl_command_queue cq;
+
+#if 1
+      //
+      // <= OpenCL 1.2
+      //
+      cl_int cl_err;
+
+      cq = clCreateCommandQueue(runtime->cl.context,
+                                runtime->cl.device_id,
+                                pool->cq_props,
+                                &cl_err); cl_ok(cl_err);
+#else
+  if (runtime_cl->version.major < 2)
+    {
+      //
+      // <= OpenCL 1.2
+      //
+      cl_int cl_err;
+
+      cq = clCreateCommandQueue(runtime_cl->context,
+                                runtime_cl->device_id,
+                                (cl_command_queue_properties)type,
+                                &cl_err); cl_ok(cl_err);
+    }
+  else
+    {
+      //
+      // >= OpenCL 2.0
+      //
+      cl_int                    cl_err;
+      cl_queue_properties const queue_properties[] = {
+        CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
+      };
+
+      cq = clCreateCommandQueueWithProperties(runtime_cl->context,
+                                              runtime_cl->device_id,
+                                              queue_properties,
+                                              &cl_err); cl_ok(cl_err);
+    }
+#endif
+
+  return cq;
+}
+
+//
+//
+//
+
+void
+skc_cq_pool_create(struct skc_runtime        * const runtime,
+                   struct skc_cq_pool        * const pool,
+                   cl_command_queue_properties const cq_props,
+                   skc_uint                    const size)
+{
+  pool->size     = size + 1; // an empty spot
+  pool->reads    = 0;
+  pool->writes   = size;
+
+  pool->cq_props = cq_props;
+  pool->cq       = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
+                                               pool->size * sizeof(*pool->cq));
+  for (skc_uint ii=0; ii<size; ii++)
+    pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
+
   pool->cq[size] = NULL;
 }
 
@@ -77,7 +135,7 @@
 //
 //
 
-static 
+static
 void
 skc_cq_pool_write(struct skc_cq_pool * const pool,
                   cl_command_queue           cq)
@@ -109,14 +167,14 @@
   pool->writes = expand;
 
   for (skc_uint ii=0; ii<expand; ii++)
-    pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
+    pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
 }
 
 //
 //
 //
 
-static 
+static
 cl_command_queue
 skc_cq_pool_read(struct skc_runtime * const runtime,
                  struct skc_cq_pool * const pool)
@@ -141,7 +199,7 @@
 }
 
 void
-skc_runtime_release_cq_in_order(struct skc_runtime * const runtime, 
+skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
                                 cl_command_queue           cq)
 {
   skc_cq_pool_write(&runtime->cq_pool,cq);
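skc_cq_pool_create() sizes the array at size + 1 so one slot is always empty and the reads/writes indices never collide when the pool is full. A standalone sketch of that one-empty-slot ring, with hypothetical names, just to make the index arithmetic explicit:

  // Sketch of a one-empty-slot ring; not the actual pool code.
  typedef struct { cl_command_queue * cq; skc_uint size, reads, writes; } example_ring;

  static void example_ring_write(example_ring * const r, cl_command_queue cq)
  {
    r->cq[r->writes] = cq;
    if (++r->writes == r->size) // wrap at capacity (requested size + 1)
      r->writes = 0;
  }

  static cl_command_queue example_ring_read(example_ring * const r)
  {
    cl_command_queue const cq = r->cq[r->reads];
    if (++r->reads == r->size)
      r->reads = 0;
    return cq;
  }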
diff --git a/src/compute/skc/platforms/cl_12/cq_pool_cl.h b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
index 0cc73a2..c614600 100644
--- a/src/compute/skc/platforms/cl_12/cq_pool_cl.h
+++ b/src/compute/skc/platforms/cl_12/cq_pool_cl.h
@@ -20,11 +20,12 @@
 
 struct skc_cq_pool
 {
-  skc_cq_type_e      type;
-  skc_uint           size;
-  skc_uint           reads;
-  skc_uint           writes;
-  cl_command_queue * cq;
+  cl_command_queue          * cq;
+  cl_command_queue_properties cq_props;
+
+  skc_uint                    size;
+  skc_uint                    reads;
+  skc_uint                    writes;
 };
 
 //l
@@ -32,10 +33,10 @@
 //
 
 void
-skc_cq_pool_create(struct skc_runtime * const runtime,
-                   struct skc_cq_pool * const pool,
-                   skc_uint             const type,
-                   skc_uint             const size);
+skc_cq_pool_create(struct skc_runtime        * const runtime,
+                   struct skc_cq_pool        * const pool,
+                   cl_command_queue_properties const cq_props,
+                   skc_uint                    const size);
 
 void
 skc_cq_pool_dispose(struct skc_runtime * const runtime,
diff --git a/src/compute/skc/platforms/cl_12/device_cl_12.h b/src/compute/skc/platforms/cl_12/device_cl_12.h
index 637b61a..ef57495 100644
--- a/src/compute/skc/platforms/cl_12/device_cl_12.h
+++ b/src/compute/skc/platforms/cl_12/device_cl_12.h
@@ -77,6 +77,10 @@
 skc_device_acquire_kernel(struct skc_device  * const device, 
                           skc_device_kernel_id const type);
 
+void
+skc_device_release_kernel(struct skc_device  * const device,
+                          cl_kernel                  kernel);
+
 //
 // grid shape can vary greatly by target platform
 //
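skc_device_release_kernel() gives callers a counterpart to skc_device_acquire_kernel(). A hedged usage sketch; the kernel id, buffer, queue and launch size are placeholders:

  // Sketch: acquire, set args, launch, then release the cl_kernel instance.
  cl_kernel kernel = skc_device_acquire_kernel(device,SKC_DEVICE_KERNEL_ID_EXAMPLE);

  cl(SetKernelArg(kernel,0,sizeof(buffer),&buffer));
  cl(EnqueueNDRangeKernel(cq,kernel,1,NULL,&global_size,NULL,0,NULL,NULL));

  skc_device_release_kernel(device,kernel); // ends up calling clReleaseKernel()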
diff --git a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl
index 726b0a7..5abbe18 100644
--- a/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/block_pool_init.cl
@@ -1,64 +1,64 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "device_cl_12.h"
-
-//
-// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
-//
-
-__kernel
-SKC_BP_INIT_IDS_KERNEL_ATTRIBS
-void
-skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
-{
-  uint const gid = get_global_id(0);
-
-  //
-  // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
-  // accomplish this with fewer threads and using either IPC and/or
-  // vector stores -- it should be on certain architectures!
-  //
-
-  //
-  // initialize pool with sequence
-  //
-  if (gid < bp_size)
-    ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
-}
-
-//
-//
-//
-
-__kernel
-SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
-void
-skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
-{
-  // the version test is to squelch a bug with the Intel OpenCL CPU
-  // compiler declaring it supports the cl_intel_subgroups extension
-#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
-  uint const tid = get_sub_group_local_id();
-#else
-  uint const tid = get_local_id(0);
-#endif
-
-  //
-  // launch two threads and store [ 0, bp_size ]
-  //
-  bp_atomics[tid] = tid * bp_size;
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "kernel_cl_12.h"
+
+//
+// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
+//
+
+__kernel
+SKC_BP_INIT_IDS_KERNEL_ATTRIBS
+void
+skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
+{
+  uint const gid = get_global_id(0);
+
+  //
+  // FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
+  // accomplish this with fewer threads and using either IPC and/or
+  // vector stores -- it should be on certain architectures!
+  //
+
+  //
+  // initialize pool with sequence
+  //
+  if (gid < bp_size)
+    ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
+}
+
+//
+//
+//
+
+__kernel
+SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
+void
+skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
+{
+  // the version test is to squelch a bug with the Intel OpenCL CPU
+  // compiler declaring it supports the cl_intel_subgroups extension
+#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
+  uint const tid = get_sub_group_local_id();
+#else
+  uint const tid = get_local_id(0);
+#endif
+
+  //
+  // launch two threads and store [ 0, bp_size ]
+  //
+  bp_atomics[tid] = tid * bp_size;
+}
+
+//
+//
+//
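Host-side, the ids kernel wants one work-item per pool entry and the atomics kernel just two lanes to store { 0, bp_size }. A hedged enqueue sketch; the queue, kernel objects, buffers and bp_size are placeholders, and the global sizes may need rounding up to each kernel's required work-group size:

  // Sketch of launching the two init kernels, ideally on an out-of-order cq.
  size_t const ids_global[1]     = { bp_size };
  size_t const atomics_global[1] = { 2 };

  cl(SetKernelArg(k_init_ids,0,sizeof(ids),&ids));
  cl(SetKernelArg(k_init_ids,1,sizeof(bp_size),&bp_size));
  cl(EnqueueNDRangeKernel(cq,k_init_ids,1,NULL,ids_global,NULL,0,NULL,NULL));

  cl(SetKernelArg(k_init_atomics,0,sizeof(bp_atomics),&bp_atomics));
  cl(SetKernelArg(k_init_atomics,1,sizeof(bp_size),&bp_size));
  cl(EnqueueNDRangeKernel(cq,k_init_atomics,1,NULL,atomics_global,NULL,0,NULL,NULL));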
diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
index aebe8fd..f7e06a1 100644
--- a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.c
@@ -19,6 +19,7 @@
 #include "config_cl.h"
 #include "runtime_cl_12.h"
 
+#include "kernel_cl_12.h"
 #include "device_cl_12.h"
 
 #include "hs/cl/hs_cl_launcher.h"
@@ -124,9 +125,9 @@
 
     .cq_pool     = {
 #ifndef NDEBUG
-      .type         = SKC_CQ_TYPE_IN_ORDER_PROFILING,
+       .cq_props    = CL_QUEUE_PROFILING_ENABLE,
 #else
-      .type         = 0,
+       .cq_props    = 0,
 #endif
       .size         = 8
     },
@@ -841,6 +842,14 @@
   return kernel;
 }
 
+
+void
+skc_device_release_kernel(struct skc_device  * const device,
+                          cl_kernel                  kernel)
+{
+  cl(ReleaseKernel(kernel));
+}
+
 //
 // INITIALIZE KERNEL ARGS
 //
diff --git a/src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h b/src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
similarity index 100%
rename from src/compute/skc/platforms/cl_12/kernels/devices/gen9/device_cl_12.h
rename to src/compute/skc/platforms/cl_12/kernels/devices/gen9/kernel_cl_12.h
diff --git a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl
index 39fee75..bcff0a3 100644
--- a/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/fills_expand.cl
@@ -1,309 +1,309 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "block.h"
-#include "path.h"
-#include "common.h"
-#include "atomic_cl.h"
-#include "raster_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
-
-#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS    / SKC_FILLS_EXPAND_ELEM_WORDS)
-#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
-
-#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD   (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#define SKC_FILLS_EXPAND_X  (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-//
-//
-
-#if   ( SKC_FILLS_EXPAND_X == 1 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_1()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  0
-
-#elif ( SKC_FILLS_EXPAND_X == 2 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_2()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  1
-
-#elif ( SKC_FILLS_EXPAND_X == 4 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_4()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  3
-
-#elif ( SKC_FILLS_EXPAND_X == 8 )
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_8()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  7
-
-#elif ( SKC_FILLS_EXPAND_X == 16)
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_16()
-#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  15
-
-#else
-#error "MISSING SKC_FILLS_EXPAND_X"
-#endif
-
-//
-// Fill and rasterize cmds only differ in their first word semantics
-//
-
-union skc_cmd_expand
-{
-  union skc_cmd_fill      fill;
-  union skc_cmd_rasterize rasterize;
-};
-
-//
-//
-//
-
-union skc_path_elem
-{
-  skc_uint  u32;
-  skc_float f32;
-};
-
-//
-// COMPILE-TIME AND RUN-TIME MACROS
-//
-
-#define SKC_ELEM_IN_RANGE(X,I)                                          \
-  (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) &&   \
-  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-#define SKC_ELEM_GTE(X,I)                                       \
-  SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-//
-// FIXME -- slate these for replacement
-//
-
-#define SKC_BROADCAST(E,S,I)                                            \
-  sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
-
-#define SKC_BROADCAST_LAST_HELPER(E,I)                                  \
-  sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
-
-#define SKC_BROADCAST_LAST(E,I)                 \
-  SKC_BROADCAST_LAST_HELPER(E,I)
-
-//
-//
-//
-
-void
-skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
-                    skc_uint                         * const out_idx,
-                    union skc_cmd_expand             * const cmd,
-                    union skc_path_elem                const e,
-                    skc_uint                           const e_idx)
-{
-  //
-  // FIXME -- we can append a large number of nodeword indices to a
-  // local SMEM queue and flush when full.  It may or may not be a
-  // performance win on some architectures.
-  //
-  skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
-  skc_uint const offset  = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
-
-  cmd->rasterize.nodeword = e_idx;
-
-  if (is_elem) {
-    cmds_out[*out_idx + offset] = cmd->rasterize;
-  }
-
-  *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
-}
-
-//
-//
-//
-
-__kernel
-SKC_FILLS_EXPAND_KERNEL_ATTRIBS
-void
-skc_kernel_fills_expand(__global union skc_path_elem     const    * const blocks,
-                        __global skc_uint                volatile * const atomics,
-                        __global skc_block_id_t          const    * const map,
-                        __global union skc_cmd_fill      const    * const cmds_in,
-                        __global union skc_cmd_rasterize          * const cmds_out)
-{
-  //
-  // Need to harmonize the way we determine a subgroup's id.  In this
-  // kernel it's not as important because no local memory is being
-  // used.  Although the device/mask calc to determine subgroup and
-  // lanes is still proper, we might want to make it clearer that
-  // we're working with subgroups by using the subgroup API.
-  //
-  // every subgroup/simd that will work on the block loads the same command
-  //
-#if (__OPENCL_VERSION__ < 200)
-  skc_uint const       cmd_stride = get_num_sub_groups();
-#else
-  skc_uint const       cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
-#endif
-  skc_uint             cmd_idx    = get_group_id(0) * cmd_stride + get_sub_group_id();
-
-  // load fill command -- we reuse y component
-  union skc_cmd_expand cmd        = { .fill = cmds_in[cmd_idx] };
-
-  // get the path header block from the map
-  skc_block_id_t       id         = map[cmd.fill.path];
-
-#if 0
-  if (get_sub_group_local_id() == 0)
-    printf("expand[%u] = %u\n",cmd_idx,id);
-#endif
-
-  //
-  // blindly load all of the head elements into registers
-  //
-  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
-
-  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-  //
-  // pick out count.nodes and count.prims from the header
-  //
-  skc_uint count_nodes, count_prims;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
-    count_nodes  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);       \
-  }                                                                     \
-  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
-    count_prims  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);       \
-  }
-
-  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-  //
-  // debug of path head
-  //
-#if 0
-  skc_uint count_blocks;
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
-    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
-  }
-
-  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-  if (get_sub_group_local_id() == 0)
-    printf("path header = { %5u, %5u, %5u }\n",
-           count_blocks,count_nodes,count_prims);
-#endif
-
-  //
-  // acquire slots in the expanded cmd extent
-  //
-  // decrement prim_idx by 1 so we can use inclusive warp scan later
-  //
-  skc_uint out_idx = 0;
-
-  if (get_sub_group_local_id() == 0) {
-    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
-      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
-  }
-
-  out_idx = sub_group_broadcast(out_idx,0);
-
-  //
-  // process ids trailing the path header
-  //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
-    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
-      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
-        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
-      }                                                                 \
-    }                                                                   \
-    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
-                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
-  }
-
-  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-  //
-  // we're done if it was just the header
-  //
-  if (count_nodes == 0)
-    return;
-
-  //
-  // otherwise, process the nodes
-  //
-
-  //
-  // get id of next node
-  //
-  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
-
-  //
-  // the following blocks are nodes
-  //
-  while (true)
-    {
-      // get index of each element
-      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
-
-      //
-      // blindly load all of the node elements into registers
-      //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
-
-      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-      //
-      // append all valid ids
-      //
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
-                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
-
-      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
-
-      // any more nodes?
-      if (--count_nodes == 0)
-        return;
-
-      //
-      // get id of next node
-      //
-      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
-    }
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "block.h"
+#include "path.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "raster_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
+
+#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS    / SKC_FILLS_EXPAND_ELEM_WORDS)
+#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
+
+#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD   (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#define SKC_FILLS_EXPAND_X  (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#if   ( SKC_FILLS_EXPAND_X == 1 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_1()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  0
+
+#elif ( SKC_FILLS_EXPAND_X == 2 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_2()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  1
+
+#elif ( SKC_FILLS_EXPAND_X == 4 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_4()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  3
+
+#elif ( SKC_FILLS_EXPAND_X == 8 )
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_8()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  7
+
+#elif ( SKC_FILLS_EXPAND_X == 16)
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND()       SKC_EXPAND_16()
+#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST  15
+
+#else
+#error "MISSING SKC_FILLS_EXPAND_X"
+#endif
+
+//
+// Fill and rasterize cmds only differ in their first word semantics
+//
+
+union skc_cmd_expand
+{
+  union skc_cmd_fill      fill;
+  union skc_cmd_rasterize rasterize;
+};
+
+//
+//
+//
+
+union skc_path_elem
+{
+  skc_uint  u32;
+  skc_float f32;
+};
+
+//
+// COMPILE-TIME AND RUN-TIME MACROS
+//
+
+#define SKC_ELEM_IN_RANGE(X,I)                                          \
+  (skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) &&   \
+  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+#define SKC_ELEM_GTE(X,I)                                       \
+  SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I)                                            \
+  sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I)                                  \
+  sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I)                 \
+  SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+//
+//
+
+void
+skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
+                    skc_uint                         * const out_idx,
+                    union skc_cmd_expand             * const cmd,
+                    union skc_path_elem                const e,
+                    skc_uint                           const e_idx)
+{
+  //
+  // FIXME -- we can append a large number of nodeword indices to a
+  // local SMEM queue and flush when full.  It may or may not be a
+  // performance win on some architectures.
+  //
+  skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
+  skc_uint const offset  = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
+
+  cmd->rasterize.nodeword = e_idx;
+
+  if (is_elem) {
+    cmds_out[*out_idx + offset] = cmd->rasterize;
+  }
+
+  *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
+}
+
+//
+//
+//
+
+__kernel
+SKC_FILLS_EXPAND_KERNEL_ATTRIBS
+void
+skc_kernel_fills_expand(__global union skc_path_elem     const    * const blocks,
+                        __global skc_uint                volatile * const atomics,
+                        __global skc_block_id_t          const    * const map,
+                        __global union skc_cmd_fill      const    * const cmds_in,
+                        __global union skc_cmd_rasterize          * const cmds_out)
+{
+  //
+  // Need to harmonize the way we determine a subgroup's id.  In this
+  // kernel it's not as important because no local memory is being
+  // used.  Although the device/mask calc to determine subgroup and
+  // lanes is still proper, we might want to make it clearer that
+  // we're working with subgroups by using the subgroup API.
+  //
+  // every subgroup/simd that will work on the block loads the same command
+  //
+#if (__OPENCL_VERSION__ < 200)
+  skc_uint const       cmd_stride = get_num_sub_groups();
+#else
+  skc_uint const       cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+  skc_uint             cmd_idx    = get_group_id(0) * cmd_stride + get_sub_group_id();
+
+  // load fill command -- we reuse y component
+  union skc_cmd_expand cmd        = { .fill = cmds_in[cmd_idx] };
+
+  // get the path header block from the map
+  skc_block_id_t       id         = map[cmd.fill.path];
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("expand[%u] = %u\n",cmd_idx,id);
+#endif
+
+  //
+  // blindly load all of the head elements into registers
+  //
+  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
+
+  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+  //
+  // pick out count.nodes and count.prims from the header
+  //
+  skc_uint count_nodes, count_prims;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
+    count_nodes  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);       \
+  }                                                                     \
+  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
+    count_prims  = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);       \
+  }
+
+  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+  //
+  // debug of path head
+  //
+#if 0
+  skc_uint count_blocks;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
+    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
+  }
+
+  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+  if (get_sub_group_local_id() == 0)
+    printf("path header = { %5u, %5u, %5u }\n",
+           count_blocks,count_nodes,count_prims);
+#endif
+
+  //
+  // acquire slots in the expanded cmd extent
+  //
+  // decrement prim_idx by 1 so we can use inclusive warp scan later
+  //
+  skc_uint out_idx = 0;
+
+  if (get_sub_group_local_id() == 0) {
+    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
+      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
+  }
+
+  out_idx = sub_group_broadcast(out_idx,0);
+
+  //
+  // process ids trailing the path header
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
+    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
+      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
+        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
+      }                                                                 \
+    }                                                                   \
+    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
+                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
+  }
+
+  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+  //
+  // we're done if it was just the header
+  //
+  if (count_nodes == 0)
+    return;
+
+  //
+  // otherwise, process the nodes
+  //
+
+  //
+  // get id of next node
+  //
+  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
+
+  //
+  // the following blocks are nodes
+  //
+  while (true)
+    {
+      // get index of each element
+      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
+
+      //
+      // blindly load all of the node elements into registers
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
+
+      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+      //
+      // append all valid ids
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
+                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
+
+      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
+
+      // any more nodes?
+      if (--count_nodes == 0)
+        return;
+
+      //
+      // get id of next node
+      //
+      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
+    }
+}
+
+//
+//
+//
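skc_cmds_out_append() above is a subgroup stream-compaction: each lane flags whether its element is a real block id, an inclusive add-scan turns the flags into 1-based output slots, and the last lane's scan value advances the running output index (the kernel pre-decrements the atomically acquired base so the 1-based offsets land correctly). An isolated sketch of just that idiom in OpenCL C, with hypothetical names:

  // Sketch of the scan-based append; returns the advanced base index.
  uint example_append_valid(__global uint * const out,
                            uint            const base,
                            uint            const value,
                            bool            const is_valid)
  {
    uint const offset = sub_group_scan_inclusive_add(is_valid ? 1 : 0);

    if (is_valid)
      out[base + offset - 1] = value; // offset is 1-based for valid lanes

    return base + sub_group_broadcast(offset,get_sub_group_size() - 1);
  }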
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
index 302ea14..63a1a43 100644
--- a/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/paths_copy.cl
@@ -1,543 +1,543 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include "path.h"
-#include "block_pool_cl.h"
-#include "path_builder_cl_12.h"
-#include "device_cl_12.h"
-
-//
-//
-//
-
-#if 0
-
-//
-// SIMD AVX2
-//
-
-#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
-#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
-#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
-
-typedef skc_uint8  skc_paths_copy_elem;
-typedef skc_uint8  skc_pb_idx_v;
-
-#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()
-
-#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)
-
-#endif
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
-#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
-#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
-
-// FIXME -- use SUBGROUP terminology everywhere
-#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
-  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
-  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
-
-// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
-
-//
-//
-//
-
-//
-// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
-//
-
-#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
-
-#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
-
-#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
-
-#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
-
-//
-//
-//
-
-skc_uint
-skc_sub_group_local_id()
-{
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
-  return get_sub_group_local_id();
-#else
-  return 0;
-#endif
-}
-
-//
-// convert an atomic read counter offset to a block id
-//
-
-skc_block_id_t
-skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
-                 skc_uint                        const bp_idx_mask,
-                 skc_uint                        const bp_reads,
-                 skc_uint                        const bp_off)
-{
-  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
-
-  return bp_ids[bp_idx];
-}
-
-//
-//
-//
-
-void
-skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx)
-{
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
-    }
-
-#if 0
-  //
-  // NOTE THIS IS PRINTING 8 ROWS
-  //
-  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
-         (skc_uint)get_global_id(0),pb_elems_idx,
-         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
-         (skc_uint)get_global_id(0),pb_elems_idx,
-         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
-         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
-#endif
-}
-
-//
-//
-//
-
-void
-skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_block_id_t      const * const bp_ids,
-              skc_uint                             const bp_reads,
-              skc_uint                             const bp_idx_mask,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx,
-              skc_uint                             const pb_rolling)
-{
-  //
-  // remap block id tags bp_elems the host-side rolling counter pb_elems a
-  // device-side block pool id
-  //
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      // load block_id_tag words
-      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];
-
-      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
-      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // only convert if original elem is not invalid
-
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
-        skc_block_id_t const b = bp_ids[bp_idx C];              \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
-      }
-
-      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
-
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem back
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-}
-
-//
-//
-//
-
-void
-skc_host_map_update(__global skc_uint * const host_map,
-                    skc_uint            const block,
-                    skc_paths_copy_elem const elem)
-{
-  //
-  // write first elem to map -- FIXME -- this is a little nasty
-  // because it relies on the the host handle always being the first
-  // word in the path header.
-  //
-  // OTOH, this is not unreasonable.  The alternative is to have a
-  // separate kernel initializing the map.
-  //
-#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
-  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
-#endif
-    {
-#if SKC_PATHS_COPY_ELEM_WORDS == 1
-      host_map[elem] = block; 
-#if 0
-      printf("[%u] = %u\n",elem,block);
-#endif
-#else
-      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
-#endif
-    }
-}
-
-//
-//
-//
-
-void
-skc_copy_head(__global skc_uint                  * const host_map,
-              skc_uint                             const block,
-              __global skc_paths_copy_elem       * const bp_elems, // to
-              skc_uint                             const bp_elems_idx,
-              __global skc_block_id_t      const * const bp_ids,
-              skc_uint                             const bp_reads,
-              skc_uint                             const bp_idx_mask,
-              __global skc_paths_copy_elem const * const pb_elems, // from
-              skc_uint                             const pb_elems_idx,
-              skc_uint                             const pb_rolling)
-{
-  //
-  // if there are more path header words than there are
-  // threads-per-block then we can just copy the initial header words
-  //
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
-  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
-
-      (bp_elems+bp_elems_idx)[ii] = elem;
-
-      if (ii == 0) {
-        skc_host_map_update(host_map,block,elem);
-      }
-    }
-#endif
-
-  //
-  // this is similar to copy node but the first H words of the path
-  // header are not modified and simply copied
-  //
-  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
-
-#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
-      if (ii == 0) {
-        skc_host_map_update(host_map,block,elem);
-      }
-#endif
-      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
-      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      // FIXME -- MIX MIX MIX MIX / SELECT
-
-      // only convert if original elem is not invalid
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                         \
-      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
-        skc_block_id_t const b = bp_ids[bp_idx C];                      \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
-      }
-
-      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
-
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem back
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-
-  //
-  // the remaining words are treated like a node
-  //
-  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
-    {
-      // load block_id_tag words
-      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];
-
-      // calculate ahead of time
-      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
-
-      //
-      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
-      // will _always_ be safe as long as we don't use the loaded
-      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
-      // of iterating over the vector components.
-      //
-
-      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
-
-      // only convert if original elem is not invalid
-#undef  SKC_EXPAND_X
-#define SKC_EXPAND_X(I,S,C,P,R)                                 \
-      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
-        skc_block_id_t const b = bp_ids[bp_idx C];              \
-        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
-      }
-
-      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
-
-      SKC_PATHS_COPY_ELEM_EXPAND();
-
-      // store the elem
-      (bp_elems+bp_elems_idx)[ii] = elem;
-    }
-}
-
-//
-// FIXME -- pack some of these constant integer args in a vec or struct
-//
-
-__kernel
-SKC_PATHS_COPY_KERNEL_ATTRIBS
-void
-skc_kernel_paths_copy
-(__global skc_uint                        * const host_map,
-
- __global skc_block_id_t            const * const bp_ids,
- __global skc_paths_copy_elem             * const bp_elems,
- skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring
-
- __global skc_uint                  const * const bp_alloc,    // block pool ring base
- skc_uint                                   const bp_alloc_idx,// which subbuf
-
- __global union skc_tagged_block_id const * const pb_cmds,
- __global skc_paths_copy_elem       const * const pb_elems,
-
- skc_uint                                   const pb_size,     // # of commands/blocks in buffer
- skc_uint                                   const pb_rolling,  // shifted rolling counter base
-
- skc_uint                                   const pb_prev_from,
- skc_uint                                   const pb_prev_span,
- skc_uint                                   const pb_curr_from)
-{
-  //
-  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
-  //
-  // - HEAD
-  // - NODE
-  // - SEGS
-  //
-  // THESE ARE SUBGROUP ORIENTED KERNELS
-  //
-  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
-  //
-
-  //
-  // It's likely that peak bandwidth is achievable with a single
-  // workgroup.
-  //
-  // So let's keep the grids modestly sized and for simplicity and
-  // portability, let's assume that a single workgroup can perform all
-  // steps in the copy.
-  //
-  // Launch as large of a workgroup as possiblex
-  //
-  // 1. ATOMICALLY ALLOCATE BLOCKS BP_ELEMS POOL
-  // 2. CONVERT COMMANDS IN PB_ELEMS BLOCK OFFSETS
-  // 3. FOR EACH COMMAND:
-  //      - HEAD: SAVED HEAD ID PB_ELEMS MAP. CONVERT AND COPY H INDICES.
-  //      - NODE: CONVERT AND COPY B INDICES
-  //      - SEGS: BULK COPY
-  //
-  // B : number of words in block -- always pow2
-  // W : intelligently/arbitrarily chosen factor of B -- always pow2
-  //
-
-  //
-  // There are several approaches to processing the commands:
-  //
-  // 1. B threads are responsible for one block. All threads broadcast
-  //    load a single command word. Workgroup size must be a facpb_elemsr of
-  //    B.
-  //
-  // 2. W threads process an entire block. W will typically be the
-  //    device's subgroup/warp/wave width. W threads broadcast load a
-  //    single command word.
-  //
-  // 3. W threads process W blocks. W threads load W command words and
-  //    process W blocks.
-  //
-  // Clearly (1) has low I/O intensity but will achieve high
-  // parallelism by activating the most possible threads. The downside
-  // of this kind of approach is that the kernel will occupy even a
-  // large GPU with low intensity work and reduce opportunities for
-  // concurrent kernel execution (of other kernels).
-  //
-  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
-  //
-  // Note that there are many other approaches.  For example, similar
-  // pb_elems (1) but each thread loads a pow2 vector of block data.
-  //
-
-  // load the copied atomic read "base" from gmem
-  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
-  // will always be less than 2^32
-  skc_uint const gid      = get_global_id(0);
-  // every subgroup/simd that will work on the block loads the same command
-  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
-  // path builder data can be spread across two spans
-  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
-
-  // no need pb_elems make this branchless
-  if (pb_idx >= pb_size)
-    pb_idx -= pb_size;
-
-  // broadcast load the command
-  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];
-
-  // what do we want pb_elems do with this block?
-  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
-
-  // compute offset from rolling base to get index into block pool ring allocation
-  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
-
-  // convert the pb_cmd's offset counter pb_elems a block id
-  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
-
-#if 0
-  if (get_sub_group_local_id() == 0) {
-    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
-    printf("< %8u >\n",block);
-  }
-#endif
-
-  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
-  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
-
-  // calculate bp_elems (to) / pb_elems (from)
-  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
-  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;
-
-  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, segs\n",bp_off);
-#endif
-      skc_copy_segs(bp_elems,
-                    bp_elems_idx,
-                    pb_elems,
-                    pb_elems_idx);
-    }
-  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, NODE\n",bp_off);
-#endif
-      skc_copy_node(bp_elems, // to
-                    bp_elems_idx,
-                    bp_ids,
-                    bp_reads,
-                    bp_idx_mask,
-                    pb_elems, // from
-                    pb_elems_idx,
-                    pb_rolling);
-    }
-  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
-    {
-#if 0
-      if (tid == 0)
-        printf("%3u, HEAD\n",bp_off);
-#endif
-      skc_copy_head(host_map,
-                    block,
-                    bp_elems, // to
-                    bp_elems_idx,
-                    bp_ids,
-                    bp_reads,
-                    bp_idx_mask,
-                    pb_elems, // from
-                    pb_elems_idx,
-                    pb_rolling);
-    }
-}
-
-//
-//
-//
-
-__kernel
-SKC_PATHS_ALLOC_KERNEL_ATTRIBS
-void
-skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
-                       __global skc_uint          * const bp_alloc,
-                       skc_uint                     const bp_alloc_idx,
-                       skc_uint                     const pb_cmd_count)
-{
-  //
-  // allocate blocks in block pool
-  //
-  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
-
-  // store in slot
-  bp_alloc[bp_alloc_idx] = reads;
-
-#if 0
-  printf("pc: %8u + %u\n",reads,pb_cmd_count);
-#endif
-}
-
-//
-//
-//
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "path.h"
+#include "block_pool_cl.h"
+#include "path_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#if 0
+
+//
+// SIMD AVX2
+//
+
+#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
+#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
+#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES
+
+typedef skc_uint8  skc_paths_copy_elem;
+typedef skc_uint8  skc_pb_idx_v;
+
+#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()
+
+#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)
+
+#endif
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
+#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
+#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)
+
+// FIXME -- use SUBGROUP terminology everywhere
+#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
+  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
+  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))
+
+// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
+
+//
+//
+//
+
+//
+// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
+//
+
+#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
+
+#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)
+
+#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)
+
+#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
+
+//
+//
+//
+
+skc_uint
+skc_sub_group_local_id()
+{
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+  return get_sub_group_local_id();
+#else
+  return 0;
+#endif
+}
+
+//
+// convert an atomic read counter offset to a block id
+//
+
+skc_block_id_t
+skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
+                 skc_uint                        const bp_idx_mask,
+                 skc_uint                        const bp_reads,
+                 skc_uint                        const bp_off)
+{
+  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;
+
+  return bp_ids[bp_idx];
+}
+
+//
+//
+//
+
+void
+skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx)
+{
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
+    }
+
+#if 0
+  //
+  // NOTE THIS IS PRINTING 8 ROWS
+  //
+  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+         (skc_uint)get_global_id(0),pb_elems_idx,
+         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
+         (skc_uint)get_global_id(0),pb_elems_idx,
+         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
+         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
+#endif
+}
+
+//
+//
+//
+
+void
+skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_block_id_t      const * const bp_ids,
+              skc_uint                             const bp_reads,
+              skc_uint                             const bp_idx_mask,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx,
+              skc_uint                             const pb_rolling)
+{
+  //
+  // remap each block id tag from the host-side rolling counter to a
+  // device-side block pool id
+  //
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      // load block_id_tag words
+      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];
+
+      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // only convert if original elem is not invalid
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
+        skc_block_id_t const b = bp_ids[bp_idx C];              \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
+      }
+
+      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);
+      
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem back
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+}
+
+//
+//
+//
+
+void
+skc_host_map_update(__global skc_uint * const host_map,
+                    skc_uint            const block,
+                    skc_paths_copy_elem const elem)
+{
+  //
+  // write first elem to map -- FIXME -- this is a little nasty
+  // because it relies on the host handle always being the first
+  // word in the path header.
+  //
+  // OTOH, this is not unreasonable.  The alternative is to have a
+  // separate kernel initializing the map.
+  //
+#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
+  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
+#endif
+    {
+#if SKC_PATHS_COPY_ELEM_WORDS == 1
+      host_map[elem] = block; 
+#if 0
+      printf("[%u] = %u\n",elem,block);
+#endif
+#else
+      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
+#endif
+    }
+}
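+
+//
+// The map entry written above is what later resolves a host path
+// handle back to its device-side head block.  For example, the reclaim
+// kernel in this change does:
+//
+//   skc_block_id_t id = map[path];   // path is the host handle
+//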
+
+//
+//
+//
+
+void
+skc_copy_head(__global skc_uint                  * const host_map,
+              skc_uint                             const block,
+              __global skc_paths_copy_elem       * const bp_elems, // to
+              skc_uint                             const bp_elems_idx,
+              __global skc_block_id_t      const * const bp_ids,
+              skc_uint                             const bp_reads,
+              skc_uint                             const bp_idx_mask,
+              __global skc_paths_copy_elem const * const pb_elems, // from
+              skc_uint                             const pb_elems_idx,
+              skc_uint                             const pb_rolling)
+{
+  //
+  // if there are more path header words than there are lanes in the
+  // subgroup then the leading, entirely-header rows can simply be copied
+  //
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
+  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];
+
+      (bp_elems+bp_elems_idx)[ii] = elem;
+
+      if (ii == 0) {
+        skc_host_map_update(host_map,block,elem);
+      }
+    }
+#endif
+
+  //
+  // this is similar to copy_node except that the first H words of the
+  // path header are simply copied unmodified
+  //
+  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];
+
+#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
+      if (ii == 0) {
+        skc_host_map_update(host_map,block,elem);
+      }
+#endif
+      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
+      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      // FIXME -- MIX MIX MIX MIX / SELECT
+
+      // only convert if original elem is not invalid
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
+        skc_block_id_t const b = bp_ids[bp_idx C];                      \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
+      }
+
+      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);
+
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem back
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+
+  //
+  // the remaining words are treated like a node
+  //
+  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
+    {
+      // load block_id_tag words
+      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];
+
+      // calculate ahead of time
+      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;
+
+      //
+      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
+      // will _always_ be safe as long as we don't use the loaded
+      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
+      // of iterating over the vector components.
+      //
+
+      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS
+
+      // only convert if original elem is not invalid
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
+        skc_block_id_t const b = bp_ids[bp_idx C];              \
+        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
+      }
+
+      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);
+
+      SKC_PATHS_COPY_ELEM_EXPAND();
+
+      // store the elem
+      (bp_elems+bp_elems_idx)[ii] = elem;
+    }
+}
+
+//
+// FIXME -- pack some of these constant integer args in a vec or struct
+//
+
+__kernel
+SKC_PATHS_COPY_KERNEL_ATTRIBS
+void
+skc_kernel_paths_copy
+(__global skc_uint                        * const host_map,
+
+ __global skc_block_id_t            const * const bp_ids,
+ __global skc_paths_copy_elem             * const bp_elems,
+ skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring
+
+ __global skc_uint                  const * const bp_alloc,    // block pool ring base
+ skc_uint                                   const bp_alloc_idx,// which subbuf
+
+ __global union skc_tagged_block_id const * const pb_cmds,
+ __global skc_paths_copy_elem       const * const pb_elems,
+
+ skc_uint                                   const pb_size,     // # of commands/blocks in buffer
+ skc_uint                                   const pb_rolling,  // shifted rolling counter base
+
+ skc_uint                                   const pb_prev_from,
+ skc_uint                                   const pb_prev_span,
+ skc_uint                                   const pb_curr_from)
+{
+  //
+  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
+  //
+  // - HEAD
+  // - NODE
+  // - SEGS
+  //
+  // THESE ARE SUBGROUP ORIENTED KERNELS
+  //
+  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
+  //
+
+  //
+  // It's likely that peak bandwidth is achievable with a single
+  // workgroup.
+  //
+  // So let's keep the grids modestly sized and, for simplicity and
+  // portability, assume that a single workgroup can perform all steps
+  // in the copy.
+  //
+  // Launch as large a workgroup as possible.
+  //
+  // 1. ATOMICALLY ALLOCATE BLOCKS FROM THE BLOCK POOL
+  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
+  // 3. FOR EACH COMMAND:
+  //      - HEAD: SAVE HEAD ID TO THE HOST MAP. CONVERT AND COPY H INDICES.
+  //      - NODE: CONVERT AND COPY B INDICES
+  //      - SEGS: BULK COPY
+  //
+  // B : number of words in block -- always pow2
+  // W : intelligently/arbitrarily chosen factor of B -- always pow2
+  //
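+  //
+  // For example (illustrative numbers only -- both values are
+  // configured per device): with B = 128 words per block and W = 8
+  // lanes per subgroup, each lane handles B / W = 16 strided words of
+  // its block.
+  //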
+
+  //
+  // There are several approaches to processing the commands:
+  //
+  // 1. B threads are responsible for one block. All threads broadcast
+  //    load a single command word. Workgroup size must be a factor of
+  //    B.
+  //
+  // 2. W threads process an entire block. W will typically be the
+  //    device's subgroup/warp/wave width. W threads broadcast load a
+  //    single command word.
+  //
+  // 3. W threads process W blocks. W threads load W command words and
+  //    process W blocks.
+  //
+  // Clearly (1) has low I/O intensity but will achieve high
+  // parallelism by activating as many threads as possible. The downside
+  // of this kind of approach is that the kernel will occupy even a
+  // large GPU with low intensity work and reduce opportunities for
+  // concurrent kernel execution (of other kernels).
+  //
+  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
+  //
+  // Note that there are many other approaches.  For example, similar
+  // to (1) but each thread loads a pow2 vector of block data.
+  //
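+  //
+  // The indexing below appears to follow approach (2): one subgroup
+  // per command.  For example, with an 8-lane subgroup (width shown
+  // for illustration only), global ids 0..7 form subgroup 0 and
+  // broadcast-load command 0, global ids 8..15 load command 1, etc.
+  //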
+
+  // load the copied atomic read "base" from gmem
+  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
+  // will always be less than 2^32
+  skc_uint const gid      = get_global_id(0);
+  // every subgroup/simd that will work on the block loads the same command
+  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
+  // path builder data can be spread across two spans
+  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);
+
+  // no need to make this branchless
+  if (pb_idx >= pb_size)
+    pb_idx -= pb_size;
+
+  // broadcast load the command
+  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];
+
+  // what do we want to do with this block?
+  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);
+
+  // compute offset from rolling base to get index into block pool ring allocation
+  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);
+
+  // convert the pb_cmd's offset counter to a block id
+  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);
+
+#if 0
+  if (get_sub_group_local_id() == 0) {
+    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
+    printf("< %8u >\n",block);
+  }
+#endif
+
+  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
+  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;
+
+  // calculate bp_elems (to) / pb_elems (from)
+  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
+  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;
+
+  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, segs\n",bp_off);
+#endif
+      skc_copy_segs(bp_elems,
+                    bp_elems_idx,
+                    pb_elems,
+                    pb_elems_idx);
+    }
+  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, NODE\n",bp_off);
+#endif
+      skc_copy_node(bp_elems, // to
+                    bp_elems_idx,
+                    bp_ids,
+                    bp_reads,
+                    bp_idx_mask,
+                    pb_elems, // from
+                    pb_elems_idx,
+                    pb_rolling);
+    }
+  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
+    {
+#if 0
+      if (tid == 0)
+        printf("%3u, HEAD\n",bp_off);
+#endif
+      skc_copy_head(host_map,
+                    block,
+                    bp_elems, // to
+                    bp_elems_idx,
+                    bp_ids,
+                    bp_reads,
+                    bp_idx_mask,
+                    pb_elems, // from
+                    pb_elems_idx,
+                    pb_rolling);
+    }
+}
+
+//
+//
+//
+
+__kernel
+SKC_PATHS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
+                       __global skc_uint          * const bp_alloc,
+                       skc_uint                     const bp_alloc_idx,
+                       skc_uint                     const pb_cmd_count)
+{
+  //
+  // allocate blocks in block pool
+  //
+  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);
+
+  // store in slot
+  bp_alloc[bp_alloc_idx] = reads;
+
+#if 0
+  printf("pc: %8u + %u\n",reads,pb_cmd_count);
+#endif
+}
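+
+//
+// A sketch of the alloc/copy handshake (values illustrative only):
+//
+//   paths_alloc: reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count)
+//                bp_alloc[bp_alloc_idx] = reads
+//   paths_copy:  bp_reads = bp_alloc[bp_alloc_idx]
+//                bp_idx   = (bp_reads + rolling offset) & bp_idx_mask
+//
+//   e.g. if the atomic was at 1000 and pb_cmd_count is 32, this grid
+//   reserves ring slots 1000..1031 and paths_copy indexes them with
+//   bp_reads = 1000.
+//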
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
index 2aee5da..5441dcd 100644
--- a/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/paths_reclaim.cl
@@ -1,390 +1,390 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-// FIXME -- a pre-allocation step could load the path header quads and

-// total up the number of blocks in the workgroup or subgroup

-// minimizing the number of later atomics adds.

-//

-

-#include "block.h"

-#include "path.h"

-#include "common.h"

-#include "atomic_cl.h"

-#include "block_pool_cl.h"

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

-

-#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS     (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)

-

-#define SKC_PATHS_RECLAIM_X                  (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)

-

-//

-//

-//

-

-#if   ( SKC_PATHS_RECLAIM_X == 1 )

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_1()

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  0

-

-#elif ( SKC_PATHS_RECLAIM_X == 2 )

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_2()

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  1

-

-#elif ( SKC_PATHS_RECLAIM_X == 4 )

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_4()

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  3

-

-#elif ( SKC_PATHS_RECLAIM_X == 8 )

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_8()

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  7

-

-#elif ( SKC_PATHS_RECLAIM_X == 16)

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_16()

-#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  15

-

-#else

-#error "MISSING SKC_PATHS_RECLAIM_X"

-#endif

-

-//

-// FIXME -- slate these for replacement

-//

-

-#define SKC_BROADCAST(E,S,I)                                            \

-  sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_BROADCAST_LAST_HELPER(E,I)                          \

-  sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

-

-#define SKC_BROADCAST_LAST(E,I)                 \

-  SKC_BROADCAST_LAST_HELPER(E,I)

-

-//

-// COMPILE-TIME PREDICATES

-//

-

-#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I)                         \

-  SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I)                            \

-  (skc_bool)SKC_GTE_MACRO(X, I   * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) &&  \

-  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)            \

-  SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)

-

-#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)                   \

-  SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)

-

-//

-// RUN-TIME PREDICATES

-//

-

-#define SKC_PATHS_RECLAIM_IS_HEADER(I)                                  \

-  (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)

-

-//

-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL

-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK

-// COMBOS (NOT NECESSARILY POW2)

-//

-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR

-// UINT TYPE INSTEAD OF A ULONG.

-//

-

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS     SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE  skc_uint

-

-//

-//

-//

-

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK  SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)

-

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I)            \

-  (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK)                  \

-   ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))

-

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C)  \

-  S = sub_group_scan_exclusive_add(C)

-

-#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I)                         \

-  (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)

-

-//

-//

-//

-

-struct skc_reclaim

-{

-  skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];

-};

-

-__kernel

-SKC_PATHS_RECLAIM_KERNEL_ATTRIBS

-void

-skc_kernel_paths_reclaim(__global skc_block_id_t          * const bp_ids,      // block pool ids ring

-                         __global skc_uint                * const bp_elems,    // block pool blocks

-                         __global skc_uint       volatile * const bp_atomics,  // read/write atomics

-                         skc_uint                           const bp_mask,     // pow2 modulo mask for block pool ring

-                         __global skc_block_id_t const    * const map,         // path host-to-device map

-                         struct   skc_reclaim               const reclaim)     // array of host path ids

-{

-#if (__OPENCL_VERSION__ < 200)

-  skc_uint const reclaim_stride = get_num_sub_groups();

-#else

-  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups

-#endif

-  skc_uint       reclaim_idx    = get_group_id(0) * reclaim_stride + get_sub_group_id();

-

-#if 0

-  //

-  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT

-  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL

-  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE

-  // RECLAMATION JOB ON THE REST OF THE PIPELINE.

-  //

-  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)

-#endif

-    {

-      // get host path id

-      skc_path_h const path = reclaim.aN[reclaim_idx];

-

-      // get the path header block from the map

-      skc_block_id_t   id   = map[path];

-

-      //

-      // blindly load all of the head elements into registers

-      //

-      skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-      skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

-

-      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-      //

-      // pick out count.nodes and count.prims from the header

-      //

-      skc_uint count_blocks, count_nodes;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \

-        count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \

-      }                                                                 \

-      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \

-        count_nodes  = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \

-      }

-

-      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-#if 0

-      if (get_sub_group_local_id() == 0) {

-        printf("reclaim paths:   %9u / %5u / %5u\n",path,count_blocks,count_nodes);

-      }

-#endif

-

-      //

-      // acquire a span in the block pool ids ring for reclaimed ids

-      //

-      // FIXME count_blocks and atomic add can be done in same lane

-      //

-      skc_uint bp_ids_base = 0;

-

-      if (get_sub_group_local_id() == 0) {

-        bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);

-

-#if 0

-        printf("paths: bp_ids_base = %u\n",bp_ids_base);

-#endif

-      }

-

-      bp_ids_base = sub_group_broadcast(bp_ids_base,0);

-

-      //

-      // shift away the tagged block id's tag

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \

-        h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;    \

-      }

-

-      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-      //

-      // swap current id with next

-      //

-      if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

-        {

-          skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

-

-          SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

-

-          id = next;

-        }

-

-      //

-      // - we'll skip subgroups that are entirely header

-      //

-      // - but we need to mark any header elements that partially fill

-      //   a subgroup as invalid tagged block ids

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \

-        if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) {    \

-          if (SKC_PATHS_RECLAIM_IS_HEADER(I)) {         \

-            h##I = SKC_TAGGED_BLOCK_ID_INVALID;         \

-          }                                             \

-        }                                               \

-      }

-

-      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-      {

-        //

-        // count reclaimable blocks in each lane

-        //

-        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \

-          packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \

-        }

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // scan to find index of each block

-        //

-        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

-

-        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

-

-        //

-        // store blocks back to ring

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \

-          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \

-          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \

-          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \

-          if (count > 0) {                                              \

-            bp_ids[bp_ids_idx] = h##I;                                  \

-          }                                                             \

-          skc_uint const total = index + count;                         \

-          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \

-        }

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        // printf("P %7u ! %u\n",bp_ids_idx,h##I);

-      }

-

-      //

-      // we're done if it was just the header

-      //

-      if (count_nodes == 0)

-        return;

-

-      //

-      // otherwise, walk the nodes

-      //

-      do {

-        // id of next block is in last lane

-        id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);

-

-        // get index of each element

-        skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

-

-        //

-        // blindly load all of the node elements into registers

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // shift away the tagged block id's tag

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-        n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // swap current id with next

-        //

-        if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

-          {

-            skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

-

-            SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

-

-            id = next;

-          }

-

-        //

-        // count reclaimable blocks in each lane

-        //

-        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // scan to find index of each block

-        //

-        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

-

-        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

-

-        //

-        // store blocks back to ring

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \

-          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \

-          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \

-          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \

-          if (count > 0) {                                              \

-            bp_ids[bp_ids_idx] = n##I;                                  \

-          }                                                             \

-          skc_uint const total = index + count;                         \

-          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \

-        }

-

-        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

-

-        // printf("P %7u ! %u\n",bp_ids_idx,n##I);

-

-        // any more nodes?

-      } while (--count_nodes > 0);

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// FIXME -- a pre-allocation step could load the path header quads and
+// total up the number of blocks in the workgroup or subgroup
+// minimizing the number of later atomics adds.
+//
+
+#include "block.h"
+#include "path.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS     (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
+
+#define SKC_PATHS_RECLAIM_X                  (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
+
+//
+//
+//
+
+#if   ( SKC_PATHS_RECLAIM_X == 1 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_1()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  0
+
+#elif ( SKC_PATHS_RECLAIM_X == 2 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_2()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  1
+
+#elif ( SKC_PATHS_RECLAIM_X == 4 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_4()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  3
+
+#elif ( SKC_PATHS_RECLAIM_X == 8 )
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_8()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  7
+
+#elif ( SKC_PATHS_RECLAIM_X == 16)
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_16()
+#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  15
+
+#else
+#error "MISSING SKC_PATHS_RECLAIM_X"
+#endif
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I)                                            \
+  sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I)                          \
+  sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I)                 \
+  SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+// COMPILE-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I)                         \
+  SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I)                            \
+  (skc_bool)SKC_GTE_MACRO(X, I   * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) &&  \
+  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)            \
+  SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
+
+#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)                   \
+  SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
+
+//
+// RUN-TIME PREDICATES
+//
+
+#define SKC_PATHS_RECLAIM_IS_HEADER(I)                                  \
+  (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
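+
+//
+// Worked example (illustrative values only; SKC_PATH_HEAD_WORDS and the
+// subgroup size are device-configured): with an 8-lane subgroup and a
+// 12-word path header, row 0 (words 0..7) is ENTIRELY_HEADER, row 1
+// (words 8..15) is PARTIALLY_HEADER, and within row 1 IS_HEADER is true
+// only for lanes 0..3.
+//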
+
+//
+// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
+// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
+// COMBOS (NOT NECESSARILY POW2)
+//
+// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
+// UINT TYPE INSTEAD OF A ULONG.
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS     SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE  skc_uint
+
+//
+//
+//
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK  SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I)            \
+  (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK)                  \
+   ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C)  \
+  S = sub_group_scan_exclusive_add(C)
+
+#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I)                         \
+  (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
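+
+//
+// Worked example (illustrative values only): with an 8-lane subgroup,
+// PACKED_COUNT_BITS = 3 and two rows (I = 0, I = 1):
+//
+//   - a lane whose row-0 id has its low subblock bits clear (a
+//     reclaimable block) and whose row-1 id does not contributes
+//     packed_count = (1 << 0) = 0x01
+//
+//   - one sub_group_scan_exclusive_add() then sums the row-0 and row-1
+//     3-bit fields across lanes simultaneously, and PACKED_COUNT_GET()
+//     extracts each lane's per-row base offset into the bp_ids ring
+//
+// The 3-bit fields cannot overflow: at most 7 lower lanes can each
+// contribute one block per row.
+//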
+
+//
+//
+//
+
+struct skc_reclaim
+{
+  skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
+};
+
+__kernel
+SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
+void
+skc_kernel_paths_reclaim(__global skc_block_id_t          * const bp_ids,      // block pool ids ring
+                         __global skc_uint                * const bp_elems,    // block pool blocks
+                         __global skc_uint       volatile * const bp_atomics,  // read/write atomics
+                         skc_uint                           const bp_mask,     // pow2 modulo mask for block pool ring
+                         __global skc_block_id_t const    * const map,         // path host-to-device map
+                         struct   skc_reclaim               const reclaim)     // array of host path ids
+{
+#if (__OPENCL_VERSION__ < 200)
+  skc_uint const reclaim_stride = get_num_sub_groups();
+#else
+  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+  skc_uint       reclaim_idx    = get_group_id(0) * reclaim_stride + get_sub_group_id();
+
+#if 0
+  //
+  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
+  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
+  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
+  // RECLAMATION JOB ON THE REST OF THE PIPELINE.
+  //
+  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
+#endif
+    {
+      // get host path id
+      skc_path_h const path = reclaim.aN[reclaim_idx];
+
+      // get the path header block from the map
+      skc_block_id_t   id   = map[path];
+
+      //
+      // blindly load all of the head elements into registers
+      //
+      skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+      //
+      // pick out count.blocks and count.nodes from the header
+      //
+      skc_uint count_blocks, count_nodes;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
+        count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
+      }                                                                 \
+      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
+        count_nodes  = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
+      }
+
+      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+#if 0
+      if (get_sub_group_local_id() == 0) {
+        printf("reclaim paths:   %9u / %5u / %5u\n",path,count_blocks,count_nodes);
+      }
+#endif
+
+      //
+      // acquire a span in the block pool ids ring for reclaimed ids
+      //
+      // FIXME count_blocks and atomic add can be done in same lane
+      //
+      skc_uint bp_ids_base = 0;
+
+      if (get_sub_group_local_id() == 0) {
+        bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
+
+#if 0
+        printf("paths: bp_ids_base = %u\n",bp_ids_base);
+#endif
+      }
+
+      bp_ids_base = sub_group_broadcast(bp_ids_base,0);
+
+      //
+      // shift away the tagged block id's tag
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \
+        h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;    \
+      }
+
+      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+      //
+      // swap current id with next
+      //
+      if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+        {
+          skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+          SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+          id = next;
+        }
+
+      //
+      // - we'll skip subgroups that are entirely header
+      //
+      // - but we need to mark any header elements that partially fill
+      //   a subgroup as invalid tagged block ids
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \
+        if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) {    \
+          if (SKC_PATHS_RECLAIM_IS_HEADER(I)) {         \
+            h##I = SKC_TAGGED_BLOCK_ID_INVALID;         \
+          }                                             \
+        }                                               \
+      }
+
+      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+      {
+        //
+        // count reclaimable blocks in each lane
+        //
+        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \
+          packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
+        }
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // scan to find index of each block
+        //
+        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+        //
+        // store blocks back to ring
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \
+          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
+          if (count > 0) {                                              \
+            bp_ids[bp_ids_idx] = h##I;                                  \
+          }                                                             \
+          skc_uint const total = index + count;                         \
+          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+        }
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        // printf("P %7u ! %u\n",bp_ids_idx,h##I);
+      }
+
+      //
+      // we're done if it was just the header
+      //
+      if (count_nodes == 0)
+        return;
+
+      //
+      // otherwise, walk the nodes
+      //
+      do {
+        // id of next block is in last lane
+        id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
+
+        // get index of each element
+        skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+        //
+        // blindly load all of the node elements into registers
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // shift away the tagged block id's tag
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+        n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // swap current id with next
+        //
+        if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
+          {
+            skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+            SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+            id = next;
+          }
+
+        //
+        // count reclaimable blocks in each lane
+        //
+        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // scan to find index of each block
+        //
+        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+        //
+        // store blocks back to ring
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
+          if (count > 0) {                                              \
+            bp_ids[bp_ids_idx] = n##I;                                  \
+          }                                                             \
+          skc_uint const total = index + count;                         \
+          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
+        }
+
+        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
+
+        // printf("P %7u ! %u\n",bp_ids_idx,n##I);
+
+        // any more nodes?
+      } while (--count_nodes > 0);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/place.cl b/src/compute/skc/platforms/cl_12/kernels/place.cl
index 92fa0a2..8866bdb 100644
--- a/src/compute/skc/platforms/cl_12/kernels/place.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/place.cl
@@ -1,871 +1,871 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "common.h"

-#include "raster.h"

-#include "atomic_cl.h"

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)

-#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)

-

-//

-//

-//

-

-#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)

-#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK

-

-//

-//

-//

-

-#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

-

-//

-//

-//

-

-#if   ( SKC_PLACE_X == 1 )

-#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()

-#define SKC_PLACE_EXPAND_I_LAST      0

-

-#elif ( SKC_PLACE_X == 2 )

-#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()

-#define SKC_PLACE_EXPAND_I_LAST      1

-

-#elif ( SKC_PLACE_X == 4 )

-#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()

-#define SKC_PLACE_EXPAND_I_LAST      3

-

-#elif ( SKC_PLACE_X == 8 )

-#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()

-#define SKC_PLACE_EXPAND_I_LAST      7

-

-#elif ( SKC_PLACE_X == 16)

-#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()

-#define SKC_PLACE_EXPAND_I_LAST      15

-#endif

-

-//

-// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE

-// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.

-//

-// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE

-// KERNELS USE DIFFERENT SUBGROUP SIZES.

-//

-// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE

-// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.

-//

-// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER

-// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY

-// ONLY SUPPORT A SUBGROUP SIZE OF 16.

-//

-

-#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

-

-#define SKC_PLACE_STRIDE_H(L)              (L)

-#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)

-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

-

-#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

-

-#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)

-#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)

-#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

-

-#define SKC_PLACE_STRIDE_H(L)              (L)

-#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)

-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

-

-#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

-

-#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)

-#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

-

-#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))

-#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)

-#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

-

-#endif

-

-//

-// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE

-// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)

-//

-

-#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

-

-#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

-

-#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

-

-#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))

-

-

-//

-// Note: HEADER_LESS_THAN purposefully wraps unsigned integer to ~UINT_MAX

-//

-#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))

-#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))

-

-//

-// TTSK v2:

-//

-//  0                                       63

-//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |

-//  +---------+--------+---------+-----+-----+

-//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |

-//

-//

-// TTPK v2:

-//

-//  0                                    63

-//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |

-//  +---------+--------+------+-----+-----+

-//  |    27   | 1 (=1) |  12  | 12  | 12  |

-//

-//

-

-//

-// TTCK (32-BIT COMPARE) v1:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          30          |    1   |    1   |   18  |  7  |  7  |

-//

-//

-// TTCK (32-BIT COMPARE) v2:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          30          |    1   |    1   |   15  |  9  |  8  |

-//

-//

-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          27          |    1   |    1   |   18  |  9  |  8  |

-//

-

-union skc_subgroup_smem

-{

-  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

-

-  struct {

-    struct {

-      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];

-      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];

-    } lo;

-

-    struct {

-      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];

-      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];

-    } hi;

-

-    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];

-  };

-

-};

-

-//

-// scatter scan max

-//

-static

-skc_int_v_t

-skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,

-                     skc_int_v_t                                 const iss,

-                     skc_int_v_t                                 const ess)

-{

-  //

-  // prefix sums determine which lanes we're going to work on next

-  //

-  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);

-  skc_int_v_t  const scratch_idx      = max(ess,0);

-

-  //

-  // SIMT

-  //

-

-  //

-  // zero the volatile smem scratchpad using vector syntax

-  //

-  smem->scratch[get_sub_group_local_id()] = ( 0 );

-

-  //

-  // store source lane at starting lane

-  //

-  if (is_scratch_store) {

-    smem->scratch[scratch_idx] = get_sub_group_local_id();

-  }

-

-  //

-  // propagate lanes to right using max scan

-  //

-  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];

-  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

-

-  return source;

-}

-

-//

-//

-//

-

-static

-skc_bool

-skc_xk_clip(union skc_tile_clip const * const tile_clip,

-            skc_ttxk_t                * const xk)

-{

-  //

-  // clip the sk and pk keys

-  //

-  // if fully clipped then return false

-  //

-  // alternatively -- we can expand all these keys in place

-  //

-  // alternatively -- keep sk and pk keys segregated because sk

-  // represents the vast majority of keys and are easier to process.

-  // don't mess with the fastpath!

-  //

-  return false;

-}

-

-//

-//

-//

-

-static

-skc_ttck_t

-skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,

-             union skc_cmd_place              const    * const cmd,

-             skc_uint                                    const sk_idx)

-{

-  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0

-  skc_uint const hi = smem->hi.sk[sk_idx];

-

-  skc_ttck_t ck;

-

-  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

-

-  // FIXME -- x and y should already be clipped and shifted

-  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;

-  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

-

-  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

-

-  return ck;

-}

-

-static

-skc_ttck_t

-skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,

-             union skc_cmd_place              const    * const cmd,

-             skc_uint                                    const pk_idx,

-             skc_uint                                    const dx)

-{

-  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1

-  skc_uint const hi = smem->hi.pk[pk_idx];

-

-  skc_ttck_t ck;

-

-  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

-

-  // FIXME -- x and y should already be clipped and shifted

-  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;

-  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

-

-  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

-

-  return ck;

-}

-

-//

-//

-//

-

-static

-void

-skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,

-               __global skc_ttck_t                       * const ck_extent,

-               __local union skc_subgroup_smem  volatile * const smem,

-               union skc_cmd_place              const    * const cmd,

-               skc_uint                         const            sk)

-{

-  //

-  // Pretty sure you can never ever have an sk count equal to 0

-  //

-  skc_uint ck_base = 0;

-

-  // last lane performs the block pool allocation with an atomic increment

-  if (get_sub_group_local_id() == 0) {

-    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);

-  }

-

-  // broadcast base to all lanes

-  ck_base = sub_group_broadcast(ck_base,0);

-

-  // convert sk keys to ck keys

-  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)

-    {

-      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);

-    }

-}

-

-//

-//

-//

-

-static

-skc_int

-skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,

-                  skc_uint                                    const idx)

-{

-  skc_uint const lo      = smem->lo.pk[idx];

-  skc_uint const hi      = smem->hi.pk[idx];

-

-  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;

-  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

-

-  return (span_lo | span_hi) + 1;

-}

-

-//

-//

-//

-

-static

-void

-skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,

-               __global skc_ttck_t                       * const ck_extent,

-               __local union skc_subgroup_smem  volatile * const smem,

-               union skc_cmd_place              const    * const cmd,

-               skc_uint                         const            pk)

-{

-  // bail out if pk queue is empty

-  if (pk == 0)

-    return;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("%u\n",pk);

-#endif

-

-  //

-  // FIXME -- this nested loop iterates over the queue processing a

-  // subgroup of 64-bit keys at a time.  This is probably not the most

-  // efficient approach so investigate how to store and iterate over a

-  // wider than subgroup (node-sized) queue of keys.

-  //

-

-  // round up so we work with full subgroups

-  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;

-  skc_uint       ii    = 0;

-

-  // nested loop that expands all ttpk keys

-#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)

-  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)

-#endif

-    {

-      skc_uint idx  = ii + get_sub_group_local_id();

-      skc_int  span = 0;

-

-      // how many tiles does this ttpk span?

-      if (idx < pk)

-        span = skc_ttpk_get_span(smem,idx);

-

-      // we need inclusive, exclusive and total

-      skc_int iss = sub_group_scan_inclusive_add(span);

-      skc_int ess = iss - span;

-      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

-

-      // printf("%u : %u\n",span,iss);

-      // continue;

-

-      // atomically allocate space for the pk keys

-      skc_uint ck_base = 0;

-

-      // last lane performs the block pool allocation with an atomic increment

-      if (get_sub_group_local_id() == 0) {

-        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);

-      }

-

-      // broadcast atomically allocated extent base to all lanes

-      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

-

-      //

-      // FIXME -- this loop would probably be faster if the ttpk keys

-      // were held in registers and accessed with shuffles instead of

-      // SMEM loads

-      //

-

-      //

-      // loop until there are no more expanded pk keys

-      //

-      while (true)

-        {

-          skc_int const source = skc_scatter_scan_max(smem,iss,ess);

-          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

-

-          // store valid ck keys to gmem

-          if (get_sub_group_local_id() < rem) {

-            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);

-          }

-

-          // decrement remainder

-          rem -= SKC_PLACE_SUBGROUP_SIZE;

-

-          if (rem <= 0)

-            break;

-

-          // increment/decrement indices

-          ck_idx += SKC_PLACE_SUBGROUP_SIZE;

-          iss    -= SKC_PLACE_SUBGROUP_SIZE;

-          ess    -= SKC_PLACE_SUBGROUP_SIZE;

-        }

-    }

-}

-

-//

-//

-//

-

-static

-skc_uint

-skc_ballot(skc_uint * const xk, skc_uint const is_xk)

-{

-#if 0

-  //

-  // FIXME -- when available, this should use the idiom:

-  //

-  //   ballot() + lane_mask_less_than_or_equal + popcount()

-  //

-  // Supported by:

-  //

-  //   - Vulkan 1.1 / SPIR-V 1.3

-  //   - CUDA

-  //   - AVX2 (SSE*?)

-  //

-#else

-  //

-  // otherwise, emulate with an inclusive scan (yuk)

-  //

-  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

-

-  skc_uint const xk_idx = *xk + prefix - is_xk;

-

-  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

-

-#if 0

-  printf("< %3u >\n",xk_idx);

-#endif

-

-  return xk_idx;

-#endif

-}

-

-//

-//

-//

-__kernel

-SKC_PLACE_KERNEL_ATTRIBS

-void

-skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,

-                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,

-                 __global skc_ttck_t                   * const ck_extent,

-                 __global union skc_cmd_place const    * const cmds,

-                 __global skc_block_id_t               * const map,

-                 skc_uint4                               const clip,

-                 skc_uint                                const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )

-  __local union skc_subgroup_smem  volatile                smem[1];

-#else

-  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];

-  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();

-#endif

-

-  //

-  // This is a subgroup-centric kernel

-  //

-  // Which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing

-  // get_group_id(0) as a uniform but the alternative calculation used

-  // when there are multiple subgroups per workgroup is not

-  // cooperating and driving spillage elsewhere.

-  //

-  // Test the raster's translated bounds against the composition's

-  // tile clip

-  //

-  // There are 3 cases:

-  //

-  //   - the raster is completely clipped -> return

-  //   - the raster is partially  clipped -> all keys must clipped

-  //   - the raster is not        clipped -> no keys are tested

-  //

-  //

-  // There are at least 4 implementations of place and we want to

-  // special-case them as much as possible so that, at the least, the

-  // fastpath remains fast.

-  //

-  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP

-  //

-  //  - implement CLIPPED + NO TRANSLATION path

-  //

-  //  - implement NO CLIP +    TRANSLATION path

-  //

-  //  - implement CLIPPED +    TRANSLATION path

-  //

-  //

-  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin

-  // 12:12:8 integer where:

-  //

-  //  12: ttsk

-  //  12: ttpk

-  //   8: /dev/null -- clipped or invalid key

-  //

-  // Three kinds of nodes in a raster's list:

-  //

-  //  - the head node

-  //  - an internal node

-  //  - the final node

-  //

-

-#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )

-  skc_uint const cmd_idx = get_group_id(0);

-#else

-  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  // load command

-  union skc_cmd_place const cmd = cmds[cmd_idx];

-

-  // get the raster header from the raster host id -- scalar

-  skc_block_id_t            id  = map[cmd.raster_h];

-

-  //

-  // load all of the head block ttxk keys into registers

-  //

-  // FIXME -- this pattern lends itself to using the higher

-  // performance Intel GEN block load instructions

-  //

-  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                 \

-  union skc_raster_node_elem const h##I = {                     \

-    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \

-               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \

-  };

-

-  SKC_PLACE_EXPAND();

-

-  //

-  // load raster header counts -- we only need the "nodes" and "keys"

-  // words but the keys we loaded are doublewords.

-  //

-  // FIXME -- this can be made portable with compile-time macro expansion

-  //

-  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES

-  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

-

-  //

-  //

-  //

-#if 0

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                 \

-  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \

-         nodes,keys,                                            \

-         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \

-         h##I.u32v2.hi,h##I.u32v2.lo,                           \

-         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

-

-  SKC_PLACE_EXPAND();

-#endif

-

-  //

-#if 0

-  if (get_sub_group_local_id() == 0) {

-    printf("place: %u / %u / %u\n",head_id,nodes,keys);

-  }

-#endif

-

-  {

-    //

-    // classify every key in the header

-    //

-    // keys: 0 is not a key / 1 is a key

-    // skpk: 0 is sk        / 1 is pk

-    //

-    skc_uint bits_keys = 0;

-    skc_uint bits_skpk = 0;

-

-    //

-    // calculate bits_keys

-    //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \

-      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \

-      if (idx < keys) {                                                 \

-        bits_keys |= (1u << I);                                         \

-      }                                                                 \

-      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \

-        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \

-          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \

-            bits_keys &= ~(1u << I);                                    \

-          }                                                             \

-        }                                                               \

-      }                                                                 \

-    }

-

-    SKC_PLACE_EXPAND();

-

-    //

-    // blindly calculate bits_skpk

-    //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \

-      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \

-    }

-

-    SKC_PLACE_EXPAND();

-

-#if 0

-    printf("%2X : %2X\n",bits_keys,bits_skpk);

-#endif

-

-    //

-    // next pointer is last element of last row.  save it now because

-    // this might be recognized as a subgroup-uniform/scalar.

-    //

-    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

-

-    //

-    // append SK keys first

-    //

-    skc_uint const bits_sk = bits_keys & ~bits_skpk;

-    skc_uint       sk      = 0;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                 \

-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \

-      skc_uint is_sk  = (bits_sk >> I) & 1;     \

-      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \

-      if (is_sk) {                              \

-        smem->lo.sk[sk_idx] = h##I.xk.lo;       \

-        smem->hi.sk[sk_idx] = h##I.xk.hi;       \

-      }                                         \

-    }

-

-    SKC_PLACE_EXPAND();

-

-    //

-    // append PK keys next

-    //

-    skc_uint const bits_pk = bits_keys & bits_skpk;

-    skc_uint       pk      = 0;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                 \

-    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \

-      skc_uint is_pk  = (bits_pk >> I) & 1;     \

-      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \

-      if (is_pk) {                              \

-        smem->lo.pk[pk_idx] = h##I.xk.lo;       \

-        smem->hi.pk[pk_idx] = h##I.xk.hi;       \

-      }                                         \

-    }

-

-    SKC_PLACE_EXPAND();

-

-#if 0

-    printf("%2u * %2u\n",sk,pk);

-#endif

-    //

-    // flush the keys

-    //

-    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);

-    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

-  }

-

-  //

-  // we're done if there was only a head node

-  //

-  if (nodes == 0)

-    return;

-

-  //

-  // decrement keys

-  //

-  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

-

-  //

-  // otherwise, append keys in trailing nodes to smem

-  //

-  while (true)

-    {

-      //

-      // load all of the node block ttxk keys into registers

-      //

-      // FIXME -- this pattern lends itself to using the higher

-      // performance Intel GEN block load instructions

-      //

-      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-      union skc_raster_node_elem const n##I = {                         \

-        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \

-                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \

-      };

-

-      SKC_PLACE_EXPAND();

-

-#if 0

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \

-             nodes,keys,                                                \

-             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \

-             n##I.u32v2.hi,n##I.u32v2.lo,                               \

-             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

-

-      SKC_PLACE_EXPAND();

-#endif

-

-      //

-      // classify every key in the header

-      //

-      // keys: 0 is not a key / 1 is a key

-      // skpk: 0 is sk        / 1 is pk

-      //

-      skc_uint bits_keys = 0;

-      skc_uint bits_skpk = 0;

-

-      //

-      // calculate bits_keys

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \

-        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \

-        if (idx < keys) {                                               \

-          bits_keys |= (1u << I);                                       \

-        }                                                               \

-        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \

-          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \

-            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \

-              bits_keys &= ~(1u << I);                                  \

-            }                                                           \

-          }                                                             \

-        }                                                               \

-      }

-

-      SKC_PLACE_EXPAND();

-

-      //

-      // blindly calculate bits_skpk

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \

-        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \

-      }

-

-      SKC_PLACE_EXPAND();

-

-#if 0

-      printf("%2X : %2X\n",bits_keys,bits_skpk);

-#endif

-

-      //

-      // next pointer is last element of last row.  save it now because

-      // this might be recognized as a subgroup-uniform/scalar.

-      //

-      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

-

-      //

-      // append SK keys first

-      //

-      skc_uint const bits_sk = bits_keys & ~bits_skpk;

-      skc_uint       sk      = 0;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                       \

-        skc_uint is_sk  = (bits_sk >> I) & 1;           \

-        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \

-        if (is_sk) {                                    \

-          smem->lo.sk[sk_idx] = n##I.xk.lo;             \

-          smem->hi.sk[sk_idx] = n##I.xk.hi;             \

-        }                                               \

-      }

-

-      SKC_PLACE_EXPAND();

-

-      //

-      // append PK keys next

-      //

-      skc_uint const bits_pk = bits_keys & bits_skpk;

-      skc_uint       pk      = 0;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                       \

-        skc_uint is_pk  = (bits_pk >> I) & 1;           \

-        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \

-        if (is_pk) {                                    \

-          smem->lo.pk[pk_idx] = n##I.xk.lo;             \

-          smem->hi.pk[pk_idx] = n##I.xk.hi;             \

-        }                                               \

-      }

-

-      SKC_PLACE_EXPAND();

-

-#if 0

-    printf("%2u * %2u\n",sk,pk);

-#endif

-      //

-      // if total for either the sk or pk queue reaches the

-      // highwater mark then flush it to the extent

-      //

-      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);

-      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

-

-      //

-      // if this was the last node then we're done

-      //

-      if (--nodes == 0)

-        return;

-

-      //

-      // otherwise decrement keys

-      //

-      keys -= SKC_RASTER_NODE_COUNT_KEYS;

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
+#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK
+
+//
+//
+//
+
+#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)
+
+//
+//
+//
+
+#if   ( SKC_PLACE_X == 1 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
+#define SKC_PLACE_EXPAND_I_LAST      0
+
+#elif ( SKC_PLACE_X == 2 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
+#define SKC_PLACE_EXPAND_I_LAST      1
+
+#elif ( SKC_PLACE_X == 4 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
+#define SKC_PLACE_EXPAND_I_LAST      3
+
+#elif ( SKC_PLACE_X == 8 )
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
+#define SKC_PLACE_EXPAND_I_LAST      7
+
+#elif ( SKC_PLACE_X == 16)
+#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
+#define SKC_PLACE_EXPAND_I_LAST      15
+#endif
+
+//
+// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
+// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
+//
+// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
+// KERNELS USE DIFFERENT SUBGROUP SIZES.
+//
+// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
+// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
+//
+// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
+// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
+// ONLY SUPPORT A SUBGROUP SIZE OF 16.
+//
+
+#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )
+
+#define SKC_PLACE_STRIDE_H(L)              (L)
+#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
+#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))
+
+#define SKC_PLACE_STRIDE_H(L)              (L)
+#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
+#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
+#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)
+
+#endif
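//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  A
// tiny host-side model of the equal-subgroup-size case above,
// assuming both kernels use a subgroup size of 8: key k's lo word is
// stored at dword (k/8)*16 + (k%8) and its hi word 8 dwords later,
// matching the "lo first, followed by hi" coalesced store order
// described earlier.
//
#include <stdio.h>

#define SG 8u  // assumed subgroup size for the example

static unsigned stride_v_lo(unsigned const i) { return i * 2u * SG; }
static unsigned stride_v_hi(unsigned const i) { return stride_v_lo(i) + SG; }

int main(void)
{
  for (unsigned i = 0; i < 2; i++)              // two rows of keys
    for (unsigned lane = 0; lane < SG; lane++)  // one key per lane
      printf("key %2u : lo @ dword %2u, hi @ dword %2u\n",
             i * SG + lane,
             stride_v_lo(i) + lane,
             stride_v_hi(i) + lane);
  return 0;
}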
+
+//
+// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
+// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
+//
+
+#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)
+
+#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)
+
+#define SKC_PLACE_IS_HEADER_ROW_KEY(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+
+
+//
+// Note: HEADER_LESS_THAN purposefully relies on unsigned wraparound: for
+// lanes still inside the raster header the subtraction underflows toward
+// UINT_MAX, so the comparison is always false
+//
+#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
+#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))
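//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  Why
// the unsigned wraparound above is safe: for a lane that is still
// inside the raster header, (idx - SKC_RASTER_HEAD_DWORDS) underflows
// to a value near UINT_MAX, so the "< keys" comparison is always
// false and header words are never classified as keys.  The 8-dword
// header and the key count below are assumptions for the example.
//
#include <stdio.h>

int main(void)
{
  unsigned const head_dwords = 8;  // assumed raster header size in dwords
  unsigned const keys        = 5;  // assumed key count

  for (unsigned idx = 6; idx < 11; idx++) {
    unsigned const rel = idx - head_dwords;  // wraps around for idx < 8
    printf("idx %2u -> rel %10u -> is_key %d\n", idx, rel, rel < keys);
  }
  return 0;
}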
+
+//
+// TTSK v2:
+//
+//  0                                       63
+//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
+//  +---------+--------+---------+-----+-----+
+//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
+//
+//
+// TTPK v2:
+//
+//  0                                    63
+//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
+//  +---------+--------+------+-----+-----+
+//  |    27   | 1 (=1) |  12  | 12  | 12  |
+//
+//
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   18  |  7  |  7  |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   15  |  9  |  8  |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          27          |    1   |    1   |   18  |  9  |  8  |
+//
+
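//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.
// Packing a key per the last (64-bit compare) TTCK layout above: a
// 27-bit id, 1-bit prefix, 1-bit escape, 18-bit layer, 9-bit x and
// 8-bit y.  The field widths come from the table; the helper and the
// sample values are hypothetical.
//
#include <stdint.h>
#include <stdio.h>

static uint64_t ttck_pack(uint32_t const id,     uint32_t const prefix,
                          uint32_t const escape, uint32_t const layer,
                          uint32_t const x,      uint32_t const y)
{
  return ((uint64_t)(id     & 0x7FFFFFFu)     ) |
         ((uint64_t)(prefix & 0x1u)      << 27) |
         ((uint64_t)(escape & 0x1u)      << 28) |
         ((uint64_t)(layer  & 0x3FFFFu)  << 29) |
         ((uint64_t)(x      & 0x1FFu)    << 47) |
         ((uint64_t)(y      & 0xFFu)     << 56);
}

int main(void)
{
  // y lands in the most significant bits, so a 64-bit sort of these
  // keys orders by tile y, then x, then layer
  printf("%016llX\n",(unsigned long long)ttck_pack(1234,0,0,7,3,5));
  return 0;
}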
+union skc_subgroup_smem
+{
+  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE
+
+  struct {
+    struct {
+      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } lo;
+
+    struct {
+      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
+      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
+    } hi;
+
+    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
+  };
+
+};
+
+//
+// scatter scan max
+//
+static
+skc_int_v_t
+skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,
+                     skc_int_v_t                                 const iss,
+                     skc_int_v_t                                 const ess)
+{
+  //
+  // prefix sums determine which lanes we're going to work on next
+  //
+  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
+  skc_int_v_t  const scratch_idx      = max(ess,0);
+
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad -- one scratch word per lane
+  //
+  smem->scratch[get_sub_group_local_id()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  //
+  if (is_scratch_store) {
+    smem->scratch[scratch_idx] = get_sub_group_local_id();
+  }
+
+  //
+  // propagate lanes to right using max scan
+  //
+  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
+  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);
+
+  return source;
+}
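//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  A
// serial C model of the scatter/max-scan idiom above, assuming a
// subgroup size of 8 and a made-up set of ttpk spans: each lane with
// a queued key writes its lane id at its exclusive-scan output slot,
// and an inclusive max-scan then tells every output slot which source
// lane (and which tile offset dx) it expands.
//
#include <stdio.h>

#define SG 8

int main(void)
{
  int const span[SG] = { 2, 1, 3, 1, 0, 0, 0, 0 };  // 4 queued keys, 4 idle lanes
  int iss[SG], ess[SG], scratch[SG] = { 0 }, source[SG];

  // inclusive/exclusive scans of the spans plus the total, as in skc_ttpk_flush()
  for (int i = 0, sum = 0; i < SG; i++) {
    sum    += span[i];
    iss[i]  = sum;
    ess[i]  = sum - span[i];
  }

  int const rem = iss[SG - 1];

  // scatter: store the source lane at its starting output slot
  for (int i = 0; i < SG; i++)
    if (iss[i] > 0 && ess[i] < SG)
      scratch[ess[i]] = i;

  // inclusive max-scan propagates the source lane to the right
  for (int i = 0; i < SG; i++)
    source[i] = (i == 0) ? scratch[0]
                         : (scratch[i] > source[i - 1] ? scratch[i] : source[i - 1]);

  // only the first 'rem' slots are valid, mirroring the kernel's guard
  for (int i = 0; i < rem; i++)
    printf("slot %d <- pk key %d, dx %d\n", i, source[i], i - ess[source[i]]);

  return 0;
}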
+
+//
+//
+//
+
+static
+skc_bool
+skc_xk_clip(union skc_tile_clip const * const tile_clip,
+            skc_ttxk_t                * const xk)
+{
+  //
+  // clip the sk and pk keys
+  //
+  // if fully clipped then return false
+  //
+  // alternatively -- we can expand all these keys in place
+  //
+  // alternatively -- keep sk and pk keys segregated because sk keys
+  // represent the vast majority of keys and are easier to process.
+  // don't mess with the fastpath!
+  //
+  return false;
+}
+
+//
+//
+//
+
+static
+skc_ttck_t
+skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
+             union skc_cmd_place              const    * const cmd,
+             skc_uint                                    const sk_idx)
+{
+  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
+  skc_uint const hi = smem->hi.sk[sk_idx];
+
+  skc_ttck_t ck;
+
+  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
+
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
+
+  return ck;
+}
+
+static
+skc_ttck_t
+skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
+             union skc_cmd_place              const    * const cmd,
+             skc_uint                                    const pk_idx,
+             skc_uint                                    const dx)
+{
+  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
+  skc_uint const hi = smem->hi.pk[pk_idx];
+
+  skc_ttck_t ck;
+
+  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
+
+  // FIXME -- x and y should already be clipped and shifted
+  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
+  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
+
+  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
+
+  return ck;
+}
+
+//
+//
+//
+
+static
+void
+skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                         const            sk)
+{
+  //
+  // Pretty sure you can never ever have an sk count equal to 0
+  //
+  skc_uint ck_base = 0;
+
+  // lane 0 reserves space in the ck extent with an atomic add
+  if (get_sub_group_local_id() == 0) {
+    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
+  }
+
+  // broadcast base to all lanes
+  ck_base = sub_group_broadcast(ck_base,0);
+
+  // convert sk keys to ck keys
+  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
+    {
+      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
+    }
+}
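//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  A
// serial C model of the flush pattern above: one lane reserves a
// contiguous range in the extent with a single fetch-and-add, the
// base is shared with the whole subgroup, and the lanes then emit
// keys with a subgroup-strided loop.  The subgroup size of 8, the
// key count and the fetch_add() helper are assumptions.
//
#include <stdio.h>

#define SG 8u

static unsigned place_atomic = 0;

static unsigned fetch_add(unsigned * const p, unsigned const n)
{
  unsigned const prev = *p; *p += n; return prev;
}

int main(void)
{
  unsigned const sk      = 13;                            // keys queued in smem
  unsigned const ck_base = fetch_add(&place_atomic, sk);  // "lane 0" allocates

  // every lane walks ii = lane, lane+SG, lane+2*SG, ... just like the kernel
  for (unsigned lane = 0; lane < SG; lane++)
    for (unsigned ii = lane; ii < sk; ii += SG)
      printf("lane %u -> ck_extent[%u]\n", lane, ck_base + ii);

  return 0;
}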
+
+//
+//
+//
+
+static
+skc_int
+skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,
+                  skc_uint                                    const idx)
+{
+  skc_uint const lo      = smem->lo.pk[idx];
+  skc_uint const hi      = smem->hi.pk[idx];
+
+  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
+  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
+
+  return (span_lo | span_hi) + 1;
+}
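//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  Per
// the TTPK v2 layout above, the 12-bit span straddles the 64-bit key:
// its low 4 bits sit at the top of the lo word (above the 27-bit id
// and the prefix bit) and its high 8 bits sit at the bottom of the hi
// word.  The concrete shifts below are derived from that table, and
// the span is stored biased by -1, matching the "+ 1" in
// skc_ttpk_get_span().
//
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint32_t const span   = 10;                // tiles covered by this prefix key
  uint32_t const stored = span - 1;          // biased encoding

  uint32_t const lo = (stored & 0xFu) << 28; // low  4 span bits in the lo word
  uint32_t const hi = (stored >> 4) & 0xFFu; // high 8 span bits in the hi word

  uint32_t const span_lo = lo >> 28;
  uint32_t const span_hi = (hi & 0xFFu) << 4;

  printf("decoded span = %u\n", (span_lo | span_hi) + 1);  // prints 10
  return 0;
}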
+
+//
+//
+//
+
+static
+void
+skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
+               __global skc_ttck_t                       * const ck_extent,
+               __local union skc_subgroup_smem  volatile * const smem,
+               union skc_cmd_place              const    * const cmd,
+               skc_uint                         const            pk)
+{
+  // bail out if pk queue is empty
+  if (pk == 0)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("%u\n",pk);
+#endif
+
+  //
+  // FIXME -- this nested loop iterates over the queue processing a
+  // subgroup of 64-bit keys at a time.  This is probably not the most
+  // efficient approach so investigate how to store and iterate over a
+  // wider than subgroup (node-sized) queue of keys.
+  //
+
+  // round up so we work with full subgroups
+  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
+  skc_uint       ii    = 0;
+
+  // nested loop that expands all ttpk keys
+#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
+  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
+#endif
+    {
+      skc_uint idx  = ii + get_sub_group_local_id();
+      skc_int  span = 0;
+
+      // how many tiles does this ttpk span?
+      if (idx < pk)
+        span = skc_ttpk_get_span(smem,idx);
+
+      // we need inclusive, exclusive and total
+      skc_int iss = sub_group_scan_inclusive_add(span);
+      skc_int ess = iss - span;
+      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
+
+      // printf("%u : %u\n",span,iss);
+      // continue;
+
+      // atomically allocate space for the pk keys
+      skc_uint ck_base = 0;
+
+      // lane 0 reserves space in the ck extent with an atomic add
+      if (get_sub_group_local_id() == 0) {
+        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
+      }
+
+      // broadcast atomically allocated extent base to all lanes
+      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
+
+      //
+      // FIXME -- this loop would probably be faster if the ttpk keys
+      // were held in registers and accessed with shuffles instead of
+      // SMEM loads
+      //
+
+      //
+      // loop until there are no more expanded pk keys
+      //
+      while (true)
+        {
+          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
+          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
+
+          // store valid ck keys to gmem
+          if (get_sub_group_local_id() < rem) {
+            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
+          }
+
+          // decrement remainder
+          rem -= SKC_PLACE_SUBGROUP_SIZE;
+
+          if (rem <= 0)
+            break;
+
+          // increment/decrement indices
+          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
+          iss    -= SKC_PLACE_SUBGROUP_SIZE;
+          ess    -= SKC_PLACE_SUBGROUP_SIZE;
+        }
+    }
+}
+
+//
+//
+//
+
+static
+skc_uint
+skc_ballot(skc_uint * const xk, skc_uint const is_xk)
+{
+#if 0
+  //
+  // FIXME -- when available, this should use the idiom:
+  //
+  //   ballot() + lane_mask_less_than_or_equal + popcount()
+  //
+  // Supported by:
+  //
+  //   - Vulkan 1.1 / SPIR-V 1.3
+  //   - CUDA
+  //   - AVX2 (SSE*?)
+  //
+#else
+  //
+  // otherwise, emulate with an inclusive scan (yuk)
+  //
+  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
+
+  skc_uint const xk_idx = *xk + prefix - is_xk;
+
+  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
+
+#if 0
+  printf("< %3u >\n",xk_idx);
+#endif
+
+  return xk_idx;
+#endif
+}
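//
// EDITOR'S NOTE -- illustrative sketch, not part of this change.  A
// scalar model of skc_ballot() above, assuming a subgroup size of 8
// and made-up per-lane flags: the inclusive-add-scan emulation in the
// #else branch computes each lane's output slot, and a popcount over
// the set lanes below this one (one form of the ballot idiom the
// FIXME mentions) yields the same index.  Both advance the shared
// queue count by the total.
//
#include <stdio.h>

#define SG 8

int main(void)
{
  int const is_xk[SG] = { 1, 0, 1, 1, 0, 0, 1, 0 };  // assumed per-lane flags

  // "ballot": one bit per lane with a key
  unsigned ballot = 0;
  for (int lane = 0; lane < SG; lane++)
    if (is_xk[lane]) ballot |= 1u << lane;

  int base = 0, prefix = 0;

  for (int lane = 0; lane < SG; lane++) {
    prefix += is_xk[lane];                              // inclusive scan
    int const scan_idx = base + prefix - is_xk[lane];   // emulated index
    int const popc_idx = base + __builtin_popcount(ballot & ((1u << lane) - 1));

    if (is_xk[lane])
      printf("lane %d -> slot %d (popcount form: %d)\n", lane, scan_idx, popc_idx);
  }

  base += prefix;  // the broadcast of the subgroup total in the kernel
  printf("queue count advanced to %d\n", base);
  return 0;
}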
+
+//
+//
+//
+__kernel
+SKC_PLACE_KERNEL_ATTRIBS
+void
+skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
+                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
+                 __global skc_ttck_t                   * const ck_extent,
+                 __global union skc_cmd_place const    * const cmds,
+                 __global skc_block_id_t               * const map,
+                 skc_uint4                               const clip,
+                 skc_uint                                const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem  volatile                smem[1];
+#else
+  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+
+  //
+  // This is a subgroup-centric kernel
+  //
+  // Which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
+  // get_group_id(0) as a uniform but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and is driving register spillage elsewhere.
+  //
+  // Test the raster's translated bounds against the composition's
+  // tile clip
+  //
+  // There are 3 cases:
+  //
+  //   - the raster is completely clipped -> return
+  //   - the raster is partially  clipped -> all keys must be clipped
+  //   - the raster is not        clipped -> no keys are tested
+  //
+  //
+  // There are at least 4 implementations of place and we want to
+  // special-case them as much as possible so that, at the least, the
+  // fastpath remains fast.
+  //
+  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
+  //
+  //  - implement CLIPPED + NO TRANSLATION path
+  //
+  //  - implement NO CLIP +    TRANSLATION path
+  //
+  //  - implement CLIPPED +    TRANSLATION path
+  //
+  //
+  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
+  // 12:12:8 integer where:
+  //
+  //  12: ttsk
+  //  12: ttpk
+  //   8: /dev/null -- clipped or invalid key
+  //
+  // Three kinds of nodes in a raster's list:
+  //
+  //  - the head node
+  //  - an internal node
+  //  - the final node
+  //
+
+#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const cmd_idx = get_group_id(0);
+#else
+  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  // load command
+  union skc_cmd_place const cmd = cmds[cmd_idx];
+
+  // get the raster header from the raster host id -- scalar
+  skc_block_id_t            id  = map[cmd.raster_h];
+
+  //
+  // load all of the head block ttxk keys into registers
+  //
+  // FIXME -- this pattern lends itself to using the higher
+  // performance Intel GEN block load instructions
+  //
+  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  union skc_raster_node_elem const h##I = {                     \
+    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
+               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \
+  };
+
+  SKC_PLACE_EXPAND();
+
+  //
+  // load raster header counts -- we only need the "nodes" and "keys"
+  // words but the keys we loaded are doublewords.
+  //
+  // FIXME -- this can be made portable with compile-time macro expansion
+  //
+  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+  //
+  //
+  //
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                 \
+  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \
+         nodes,keys,                                            \
+         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
+         h##I.u32v2.hi,h##I.u32v2.lo,                           \
+         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+  SKC_PLACE_EXPAND();
+#endif
+
+  //
+#if 0
+  if (get_sub_group_local_id() == 0) {
+    printf("place: %u / %u / %u\n",head_id,nodes,keys);
+  }
+#endif
+
+  {
+    //
+    // classify every key in the header
+    //
+    // keys: 0 is not a key / 1 is a key
+    // skpk: 0 is sk        / 1 is pk
+    //
+    skc_uint bits_keys = 0;
+    skc_uint bits_skpk = 0;
+
+    //
+    // calculate bits_keys
+    //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
+      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
+      if (idx < keys) {                                                 \
+        bits_keys |= (1u << I);                                         \
+      }                                                                 \
+      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
+        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
+          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
+            bits_keys &= ~(1u << I);                                    \
+          }                                                             \
+        }                                                               \
+      }                                                                 \
+    }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // blindly calculate bits_skpk
+    //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
+      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+    }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+    //
+    // next pointer is last element of last row.  save it now because
+    // this might be recognized as a subgroup-uniform/scalar.
+    //
+    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+    //
+    // append SK keys first
+    //
+    skc_uint const bits_sk = bits_keys & ~bits_skpk;
+    skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
+      skc_uint is_sk  = (bits_sk >> I) & 1;     \
+      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \
+      if (is_sk) {                              \
+        smem->lo.sk[sk_idx] = h##I.xk.lo;       \
+        smem->hi.sk[sk_idx] = h##I.xk.hi;       \
+      }                                         \
+    }
+
+    SKC_PLACE_EXPAND();
+
+    //
+    // append PK keys next
+    //
+    skc_uint const bits_pk = bits_keys & bits_skpk;
+    skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
+      skc_uint is_pk  = (bits_pk >> I) & 1;     \
+      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \
+      if (is_pk) {                              \
+        smem->lo.pk[pk_idx] = h##I.xk.lo;       \
+        smem->hi.pk[pk_idx] = h##I.xk.hi;       \
+      }                                         \
+    }
+
+    SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2u * %2u\n",sk,pk);
+#endif
+    //
+    // flush the keys
+    //
+    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+  }
+
+  //
+  // we're done if there was only a head node
+  //
+  if (nodes == 0)
+    return;
+
+  //
+  // decrement keys
+  //
+  keys -= SKC_RASTER_HEAD_COUNT_KEYS;
+
+  //
+  // otherwise, append keys in trailing nodes to smem
+  //
+  while (true)
+    {
+      //
+      // load all of the node block ttxk keys into registers
+      //
+      // FIXME -- this pattern lends itself to using the higher
+      // performance Intel GEN block load instructions
+      //
+      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      union skc_raster_node_elem const n##I = {                         \
+        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
+                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
+      };
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \
+             nodes,keys,                                                \
+             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
+             n##I.u32v2.hi,n##I.u32v2.lo,                               \
+             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);
+
+      SKC_PLACE_EXPAND();
+#endif
+
+      //
+      // classify every key in the header
+      //
+      // keys: 0 is not a key / 1 is a key
+      // skpk: 0 is sk        / 1 is pk
+      //
+      skc_uint bits_keys = 0;
+      skc_uint bits_skpk = 0;
+
+      //
+      // calculate bits_keys
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
+        if (idx < keys) {                                               \
+          bits_keys |= (1u << I);                                       \
+        }                                                               \
+        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
+          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
+            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
+              bits_keys &= ~(1u << I);                                  \
+            }                                                           \
+          }                                                             \
+        }                                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // blindly calculate bits_skpk
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
+      }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+      printf("%2X : %2X\n",bits_keys,bits_skpk);
+#endif
+
+      //
+      // next pointer is last element of last row.  save it now because
+      // this might be recognized as a subgroup-uniform/scalar.
+      //
+      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);
+
+      //
+      // append SK keys first
+      //
+      skc_uint const bits_sk = bits_keys & ~bits_skpk;
+      skc_uint       sk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+        skc_uint is_sk  = (bits_sk >> I) & 1;           \
+        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
+        if (is_sk) {                                    \
+          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
+          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
+        }                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+      //
+      // append PK keys next
+      //
+      skc_uint const bits_pk = bits_keys & bits_skpk;
+      skc_uint       pk      = 0;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                       \
+        skc_uint is_pk  = (bits_pk >> I) & 1;           \
+        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
+        if (is_pk) {                                    \
+          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
+          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
+        }                                               \
+      }
+
+      SKC_PLACE_EXPAND();
+
+#if 0
+    printf("%2u * %2u\n",sk,pk);
+#endif
+      //
+      // flush any sk/pk keys queued for this node to the extent
+      // (currently flushed per node rather than at a highwater mark)
+      //
+      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
+      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
+
+      //
+      // if this was the last node then we're done
+      //
+      if (--nodes == 0)
+        return;
+
+      //
+      // otherwise decrement keys
+      //
+      keys -= SKC_RASTER_NODE_COUNT_KEYS;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/prefix.cl b/src/compute/skc/platforms/cl_12/kernels/prefix.cl
index 21a5169..ae3397c 100644
--- a/src/compute/skc/platforms/cl_12/kernels/prefix.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/prefix.cl
@@ -1,1041 +1,1041 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "block.h"

-#include "raster.h"

-#include "atomic_cl.h"

-#include "raster_builder_cl_12.h"

-#include "device_cl_12.h"

-

-//

-// INPUT:

-//

-//   TTRK (64-BIT COMPARE)

-//

-//    0                                  63

-//    | TTSB ID |   X  |   Y  | COHORT ID |

-//    +---------+------+------+-----------+

-//    |    27   |  12  |  12  |     13    |

-//

-//

-//   TTRK (32-BIT COMPARE)

-//

-//    0                                        63

-//    | TTSB ID | N/A |   X  |   Y  | COHORT ID |

-//    +---------+-----+------+------+-----------+

-//    |    27   |  5  |  12  |  12  |     8     |

-//

-//

-// OUTPUT:

-//

-//   TTSK v2:

-//

-//    0                                     63

-//    | TTSB ID | PREFIX |  N/A |  X |  Y |

-//    +---------+--------+------+----+----+

-//    |    27   | 1 (=0) |  12  | 12 | 12 |

-//

-//

-//   TTPK v1:

-//

-//    0                                        63

-//    | TTPB ID | ALL ZEROES | SPAN |  X  |  Y  |

-//    +---------+------------+------+-----+-----+

-//    |    27   |      1     |  12  | 12  | 12  |

-//

-//

-//   TTPK v2:

-//

-//    0                                       63

-//    | TTPB ID | PREFIX | SPAN |  X  |  Y  |

-//    +---------+--------+------+-----+-----+

-//    |    27   | 1 (=1) |  12  | 12  | 12  |

-//

-

-#define SKC_PREFIX_SUBGROUP_MASK  (SKC_PREFIX_SUBGROUP_SIZE - 1)

-

-//

-// smem accumulator

-//

-

-union skc_subgroup_accum

-{

-  struct {

-    SKC_ATOMIC_INT        ttp[SKC_TILE_HEIGHT];

-  } atomic;

-

-  struct {

-    skc_ttp_t             ttp[SKC_TILE_HEIGHT];

-  } aN;

-

-  struct {

-    SKC_PREFIX_TTP_V      ttp[SKC_PREFIX_SUBGROUP_SIZE];

-  } vN;

-

-  struct {

-    SKC_PREFIX_SMEM_ZERO  ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];

-  } zero;

-};

-

-//

-//

-//

-

-struct skc_subgroup_smem

-{

-  // prefix accumulator

-  union skc_subgroup_accum accum;

-};

-

-//

-//

-//

-

-static

-skc_uint

-skc_subgroup_lane()

-{

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  return get_sub_group_local_id();

-#else

-  return 0;

-#endif

-}

-

-//

-//

-//

-

-static

-SKC_PREFIX_TTS_V_BITFIELD

-skc_tts_get_dy(skc_tts_v_t const ttsv)

-{

-  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]

-  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;

-

-  return dy - (~ttsv >> 31);

-}

-

-static

-SKC_PREFIX_TTS_V_BITFIELD

-skc_tts_get_py(skc_tts_v_t const ttsv)

-{

-  return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);

-}

-

-//

-//

-//

-

-static

-void

-skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)

-{

-  // get "altitude"

-  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);

-

-  // get the y pixel coordinate

-  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);

-

-  //

-  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?

-  //

-  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op

-  //

-

-#if 0

-  if (tts_v != SKC_TTS_INVALID)

-    printf("< %08X = %u : %d >\n",tts_v,py,dy); 

-#endif

-

-  //

-  // scatter-add the "altitude" to accumulator

-  //

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD

-  //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                         \

-  if (tts_v C != SKC_TTS_INVALID) {                                     \

-    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \

-  }

-

-#else

-  //

-  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS

-  //

-  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT

-  //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                 \

-  if (tts_v C == SKC_TTS_INVALID)               \

-    return;                                     \

-  smem->accum.aN.ttp[py C] = dy C;

-#endif

-

-  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();

-}

-

-//

-// The implication here is that if our device configuration has a

-// rectangular 1:2 tile then we need a block size of at least 2

-// subblocks. The subblock size of course needs to match the length of

-// the smallest tile side.

-//

-

-static

-void

-skc_accum_flush(__local struct skc_subgroup_smem * const smem,

-                __global skc_bp_elem_t           * const bp_elems,

-                skc_block_id_t                     const pb_id)

-{

-  // load the ttp elements

-  SKC_PREFIX_TTP_V const ttp_v  = smem->accum.vN.ttp[get_sub_group_local_id()];

-  skc_uint         const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

-  

-#if   ( SKC_TILE_RATIO == 1 )

-

-  bp_elems[offset] = ttp_v;

-

-#elif ( SKC_TILE_RATIO == 2 )

-

-  vstore2(ttp_v,offset,bp_elems);

-

-#else

-

-#error("tile ratio greater than 2 not supported")

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_accum_reset(__local struct skc_subgroup_smem * const smem)

-{

-  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)

-    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );

-}

-

-//

-// get next sk key

-//

-

-static

-skc_ttsk_s_t

-skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,

-                    skc_uint     * const sk_next,

-                    skc_int      * const rkpk_rem)

-{

-  // decrement count

-  *rkpk_rem -= 1;

-

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT with subgroup support is easy

-  //

-  // SIMT without subgroup support can always emulate with smem

-  //

-#if 0

-  //

-  // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly

-  // broadcast a uint2 cast to a long. It was probably bad to do this

-  // anyway without a union wrapping the TTSK scalar type.

-  //

-  // Consider creating a union { ulong; uint2 } at a later date --

-  // probably no need to ever do this unless it makes broadcast faster

-  // which is unlikely since it will probably be implemented as 2

-  // 32-bit broadcasts.

-  //

-  // Additionally, the TTRK and TTXK key bitfield sizes are probably

-  // cast in stone and we aren't going to change them no matter

-  // architecture we're on.

-  //

-  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);

-#else

-  skc_ttsk_s_t sk_s;

-

-  sk_s.lo   = sub_group_broadcast(sk_v->lo,*sk_next);

-  sk_s.hi   = sub_group_broadcast(sk_v->hi,*sk_next);

-  *sk_next += 1;

-#endif

-

-#else

-  //

-  // SIMD will always grab component .s0 and then rotate the vector

-  //

-  sk_s = ( sk_v->s0 );

-

-  skc_ttsk_v_rotate_down(sk_v);

-

-#endif

-

-  return sk_s;

-}

-

-//

-//

-//

-

-static

-skc_raster_yx_s

-skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)

-{

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT with subgroup support is easy

-  //

-  // SIMT without subgroup support can always emulate with smem

-  //

-  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);

-

-#else

-  //

-  // SIMD will always grab component .s0 and then rotate the vector

-  //

-  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );

-

-#endif

-

-  return yx_s;

-}

-

-//

-// mask off ttsb id

-//

-

-static

-skc_block_id_s_t

-skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)

-{

-  return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );

-}

-

-//

-// load tts_v as early as possible

-//

-

-static

-skc_tts_v_t

-skc_load_tts(__global skc_bp_elem_t * const bp_elems,

-             skc_block_id_s_t         const sb_id)

-{

-  return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );

-}

-

-//

-// massage ttrk keys into ttsk keys

-//

-

-static

-void

-skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)

-{

-  sk_v->lo = sk_v->lo  & SKC_TTXK_LO_MASK_ID;     // clear high (N/A) bits

-  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits

-}

-

-//

-// replenish ttsk keys

-//

-

-static

-void

-skc_ttsk_v_replenish(skc_ttsk_v_t                * const sk_v,

-                     skc_uint                    * const sk_next,

-                     skc_uint                    * const rks_next,

-                     __global skc_ttrk_e_t const * const rks)

-{

-  // if there are still keys available then return

-  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)

-    return;

-

-  //

-  // otherwise, replenish sk_v

-  //

-  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always

-  // divisible by TTXK_V_SIZE and therefore loading some keys from the

-  // next raster is OK.

-  //

-  *sk_next   = 0;

-  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;

-  *sk_v      = rks[*rks_next];

-

-#if 0

-  printf("* %08X ( %3u, %3u )\n",

-         sk_v->hi,

-         (sk_v->hi >> 12) & 0xFFF,

-         (sk_v->hi      ) & 0xFFF);

-#endif

-  

-  skc_ttrk_to_ttsk(sk_v);

-

-#if 0

-  printf("! %08X ( %3u, %3u )\n",

-         sk_v->hi,

-         (sk_v->hi >> 20) & 0xFFF,

-         (sk_v->hi >>  8) & 0xFFF);

-#endif

-}

-

-//

-// replenish block ids

-//

-// note that you can't overrun the block id pool since it's a ring

-//

-

-static

-void

-skc_blocks_replenish(skc_uint                      * const blocks_next,

-                     skc_uint                      * const blocks_idx,

-                     skc_block_id_v_t              * const blocks,

-                     skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring

-                     __global skc_block_id_t const * const bp_ids)

-

-{

-  *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE;

-  *blocks      = bp_ids[*blocks_idx & bp_mask];

-  *blocks_next = 0;

-

-#if 0

-  printf("replenish blocks: %u\n",*blocks);

-#endif

-}

-

-//

-//

-//

-

-static

-skc_block_id_t

-skc_blocks_get_next(skc_uint                      * const blocks_next,

-                    skc_uint                      * const blocks_idx,

-                    skc_block_id_v_t              * const blocks,

-                    skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring

-                    __global skc_block_id_t const * const bp_ids)

-{

-  // replenish?

-  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)

-    {

-      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

-    }

-

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT

-  //

-  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

-

-#else

-  //

-  // SIMD

-  //

-  skc_block_id_t id = blocks->s0;

-

-  skc_shuffle_down_1(*blocks);

-

-#endif

-

-  *blocks_next += 1;

-

-  return id;

-}

-

-//

-// subblock allocator

-//

-

-#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )

-

-static

-skc_block_id_t

-skc_subblocks_get_next_pb_id(skc_block_id_t                * const subblocks,

-                             skc_uint                      * const blocks_next,

-                             skc_uint                      * const blocks_idx,

-                             skc_block_id_v_t              * const blocks,

-                             skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring

-                             __global skc_block_id_t const * const bp_ids)

-{

-  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)

-    {

-      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

-    }

-

-  skc_block_id_t const pb_id = *subblocks;

-

-  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks

-

-  return pb_id;

-}

-

-#endif

-

-//

-// append a ttsk key to the work-in-progress node

-//

-

-static

-void

-skc_node_v_append_sk(skc_ttsk_s_t            const * const sk_s,

-

-                     skc_ttxk_v_t                  * const xk_v,

-                     skc_uint                      * const xk_v_next,

-                     skc_uint                      * const xk_v_idx,

-                     __global skc_bp_elem_t        * const bp_elems,

-

-                     skc_int                         const rkpk_rem,

-

-                     skc_uint                      * const blocks_next,

-                     skc_uint                      * const blocks_idx,

-                     skc_block_id_v_t              * const blocks,

-                     skc_uint                        const bp_mask,

-                     __global skc_block_id_t const * const bp_ids)

-{

-  //

-  // Append an sk key to the in-register xk_v vector

-  //

-  // If the work-in-progress node in gmem will only have room for one

-  // more key then:

-  //

-  //   - if this was the final SK then write out xk_v and exit

-  //

-  //   - otherwise, acquire a block id, link it, write out xk_v,

-  //     prepare new node

-  //

-  // Note that this does *not* try to squeeze in a final key into the

-  // next node slot.  This optimization isn't worth the added

-  // down-pipeline complexity.

-  //

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT

-  //

-  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))

-    {

-      *xk_v = *sk_s;

-    }

-

-  *xk_v_next += 1;

-

-  // are there more keys coming?

-  if (rkpk_rem > 0)

-    {

-      // is the node almost full?

-      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)

-        {

-          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

-

-          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)

-            {

-              xk_v->lo = id;

-              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary

-            }

-

-          // store xk_v (uint2) to bp (uint)

-          bp_elems[*xk_v_idx                         ] = xk_v->lo;

-          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

-#if 0

-          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);

-#endif

-          // reinitialize xk_v

-          xk_v->lo = SKC_UINT_MAX;

-          xk_v->hi = SKC_UINT_MAX;

-

-          // update node elem idx

-          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

-

-          // reset node count

-          *xk_v_next = 0;

-        }

-      // is xk_v full?

-      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)

-        {

-          // store xk_v to bp

-          bp_elems[*xk_v_idx                         ] = xk_v->lo;

-          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

-#if 0

-          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);

-#endif

-          // reinitialize xk_v

-          xk_v->lo = SKC_UINT_MAX;

-          xk_v->hi = SKC_UINT_MAX;

-

-          // increment node elem idx

-          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

-        }

-    }

-  else

-    {

-      bp_elems[*xk_v_idx                         ] = xk_v->lo;

-      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

-#if 0

-      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);

-#endif

-      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)

-        {

-          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

-

-          bp_elems[*xk_v_idx]                          = SKC_UINT_MAX;

-          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;

-        }

-    }

-

-#else

-  //

-  // SIMD

-  //

-

-#endif

-}

-

-//

-//

-//

-

-static

-skc_ttpk_s_t

-skc_ttpk_create(skc_raster_yx_s const yx_prev,

-                skc_raster_yx_s const yx_next,

-                skc_block_id_t  const pb_id)

-{

-  // - yx_prev is already incremented by one 

-  // - yx_span is already shifted up at hi.x

-  skc_uint const yx_span = yx_next - yx_prev;

-

-  skc_ttpk_s_t pk;

-

-  // turn on prefix bit | shift span bits upward

-  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);

-

-  // shift down high span bits | yx of tile

-  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("* %08v2X : %u\n",pk,yx_span);

-#endif

-

-  return pk;

-}

-

-//

-// append a ttpk key to the work-in-progress node

-//

-

-static

-void

-skc_node_v_append_pk(skc_ttpk_s_t            const * const pk_s,

-

-                     skc_ttxk_v_t                  * const xk_v,

-                     skc_uint                      * const xk_v_next,

-                     skc_uint                      * const xk_v_idx,

-                     __global skc_bp_elem_t        * const bp_elems,

-

-                     skc_uint                      * const blocks_next,

-                     skc_uint                      * const blocks_idx,

-                     skc_block_id_v_t              * const blocks,

-                     skc_uint                        const bp_mask,

-                     __global skc_block_id_t const * const bp_ids)

-{

-  //

-  // append a pk key to the in-register xk_v vector

-  //

-  // if the work-in-progress node in gmem will only have room for one

-  // more key then:

-  //

-  //   - if this was the final SK then write out xk_v and exit

-  //

-  //   - otherwise, acquire a block id, link it, write out xk_v,

-  //     prepare new node

-  //

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT

-  //

-  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))

-    {

-      *xk_v = *pk_s;

-    }

-

-  *xk_v_next += 1;

-

-  // is the node almost full?

-  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)

-    {

-      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

-

-      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)

-        {

-          xk_v->lo = id;

-          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary

-        }

-

-      // store xk_v to bp

-      bp_elems[*xk_v_idx                         ] = xk_v->lo;

-      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

-#if 0

-      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);

-#endif

-      // reinitialize xk_v

-      xk_v->lo = SKC_UINT_MAX;

-      xk_v->hi = SKC_UINT_MAX;

-

-      // update node elem idx

-      *xk_v_idx  = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

-

-      // reset node count

-      *xk_v_next = 0;

-    }

-  // is xk_v full?

-  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)

-    {

-      // store xk_v to bp

-      bp_elems[*xk_v_idx                         ] = xk_v->lo;

-      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;

-#if 0

-      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);

-#endif

-      // reinitialize xk_v

-      xk_v->lo = SKC_UINT_MAX;

-      xk_v->hi = SKC_UINT_MAX;

-      

-      // increment node elem idx

-      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

-    }

-

-#else

-  //

-  // SIMD

-  //

-#endif

-}

-

-//

-// append the first 3 fields of meta info to the raster header

-//

-

-static

-void

-skc_node_v_init_header(skc_ttxk_v_t                           * const xk_v,

-                       skc_uint                               * const xk_v_next,

-                       union skc_raster_cohort_meta_out const * const meta)

-{

-#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT

-  //

-  if (get_sub_group_local_id() < 2)

-    {

-      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;

-    }

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("header: %08v4X\n",meta->u32v4);

-#endif

-

-  //

-  // increment counter: uint4 + uint4 = uint2 x 4

-  //

-  *xk_v_next = 2 + 2; // +2 for uninitialized bounds

-

-#else

-  //

-  // SIMD

-  //

-

-#endif

-}

-

-//

-//

-//

-

-__kernel

-SKC_PREFIX_KERNEL_ATTRIBS

-void

-skc_kernel_prefix(__global skc_uint       const * const bp_atomics,

-                  __global skc_block_id_t const * const bp_ids,

-                  __global skc_bp_elem_t        * const bp_elems,

-                  skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring

-                  __global skc_ttrk_e_t   const * const rks,

-                  __global skc_block_id_t       * const map,

-                  __global skc_uint       const * const metas,

-                  skc_uint                        const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )

-  __local struct skc_subgroup_smem                  smem[1];

-#else

-  __local struct skc_subgroup_smem                  smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];

-  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();

-#endif

-

-  //

-  // where is this subgroup in the grid?

-  //

-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )

-  skc_uint const sgi = get_group_id(0);

-#else

-  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  skc_uint const sgl = get_sub_group_local_id();

-

-  //

-  // return if this subgroup is excess

-  //

-#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )

-  if (sgi >= count)

-    return;

-#endif

-

-  //

-  // get meta info for this subgroup's raster

-  //

-  union skc_raster_cohort_meta_out const meta  = { vload4(sgi,metas) };

-  skc_uint                         const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("%3u : %5u / %5u / %5u / %5u / %u\n",

-           sgi,

-           meta.blocks,

-           meta.offset,

-           meta.nodes,

-           meta.keys,

-           reads);

-#endif

-

-  //

-  // preload blocks -- align on subgroup

-  //

-  skc_uint         blocks_idx  = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();

-  skc_block_id_v_t blocks      = bp_ids[blocks_idx & bp_mask];

-  skc_uint         blocks_next = (reads &  SKC_PREFIX_SUBGROUP_MASK);

-

-  //

-  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset

-  //

-  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

-

-  //

-  // initialize raster header -- assumes block is greater than 8 words (4 doublewords)

-  //

-  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };

-  skc_uint     xk_v_next;

-

-  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);

-

-  //

-  // no keys -- this is an empty raster!

-  //

-  if (meta.keys == 0)

-    {

-      bp_elems[xk_v_idx                         ] = xk_v.lo;

-      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;

-

-      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)

-        {

-          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

-

-          bp_elems[xk_v_idx]                          = SKC_UINT_MAX;

-          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;

-        }

-

-      return;

-    }

-

-  //

-  // load TTRK keys and in-place convert to TTSK keys

-  //

-  skc_uint         rks_next    = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();

-  skc_ttsk_v_t     sk_v        = rks[rks_next];

-  skc_uint         sk_next     = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);

-  skc_int          rkpk_rem    = meta.keys; // signed count of remaining rk+pk keys

-

-#if 0

-  printf("* %08X ( %3u, %3u )\n",

-         sk_v.hi,

-         (sk_v.hi >> 12) & 0xFFF,

-         (sk_v.hi      ) & 0xFFF);

-#endif

-  

-  skc_ttrk_to_ttsk(&sk_v);

-

-#if 0

-  printf("! %08X ( %3u, %3u )\n",

-         sk_v.hi,

-         (sk_v.hi >> 20) & 0xFFF,

-         (sk_v.hi >>  8) & 0xFFF);

-#endif

-

-  //

-  // subblocks

-  //

-#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )

-  skc_block_id_t subblocks = 0;

-#endif

-

-  //

-  // begin "scan" of tiles

-  //

-  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);

-

-  //

-  // zero the accumulator

-  //

-  skc_accum_reset(smem);

-

-  while (true)

-    {

-      // get next rk key

-      skc_ttsk_s_t     const sk_s  = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);

-

-      // load ttsb id

-      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);

-

-      // load tts_v transaction "in flight" as early as possible

-      skc_tts_v_t      const tts_v = skc_load_tts(bp_elems,sb_id);

-

-#if 0

-      printf("{ %08X }\n",tts_v);

-#endif

-

-#if 0

-      if (get_sub_group_local_id() == 0)

-        printf("[ %d, %X ]\n",rkpk_rem,sb_id);

-#endif

-

-#if 0

-      if (get_sub_group_local_id() == 0)

-        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);

-#endif

-

-      //

-      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF

-      // TIME AND SIMD'IZED

-      //

-

-      // if yx's don't match then we're either issuing a ttpk or

-      // resetting the accumulator

-      if (sk_s.hi != yx_prev)

-        {

-          // if yx_next.y == yx_last.y then x changed

-          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)

-            {

-              //

-              // if the tile is not square then its ratio is 1:2

-              //

-#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2

-              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,

-                                                                        &blocks_next,

-                                                                        &blocks_idx,

-                                                                        &blocks,

-                                                                        bp_mask,

-                                                                        bp_ids);

-#else

-              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,

-                                                               &blocks_idx,

-                                                               &blocks,

-                                                               bp_mask,

-                                                               bp_ids);

-#endif

-

-              // flush accumulated ttp vector to block/subblock at ttpb_id

-              skc_accum_flush(smem,bp_elems,pb_id);

-

-#if 0

-              if (get_sub_group_local_id() == 0)

-                {

-                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",

-                         pb_id,

-                         (yx_prev >> SKC_TTXK_HI_OFFSET_Y),

-                         (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,

-                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,

-                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);

-                }

-#endif

-

-              //

-              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP

-              //

-              rkpk_rem -= 1;

-

-              // create the pk

-              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);

-

-              // append pk key to xk buffer

-              skc_node_v_append_pk(&pk_s,

-

-                                   &xk_v,

-                                   &xk_v_next,

-                                   &xk_v_idx,

-                                   bp_elems,

-

-                                   &blocks_next,

-                                   &blocks_idx,

-                                   &blocks,

-                                   bp_mask,

-                                   bp_ids);

-            }

-          else if (rkpk_rem > 0) // we're starting a new tile row

-            {

-              skc_accum_reset(smem);

-            }

-        }

-

-      //

-      // append sk key to node_v

-      //

-      // if rkpk_rem is zero then return from kernel

-      //

-      skc_node_v_append_sk(&sk_s,

-

-                           &xk_v,

-                           &xk_v_next,

-                           &xk_v_idx,

-                           bp_elems,

-

-                           rkpk_rem,

-

-                           &blocks_next,

-                           &blocks_idx,

-                           &blocks,

-                           bp_mask,

-                           bp_ids);

-

-      // we're done if no more sk keys

-      if (rkpk_rem == 0)

-        break;

-

-      // move to new tile

-      yx_prev = sk_s.hi;

-

-      // scatter tts values into accumulator

-      skc_accum_scatter(smem,tts_v);

-

-      // replenish sk keys

-      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "raster_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+// INPUT:
+//
+//   TTRK (64-BIT COMPARE)
+//
+//    0                                  63
+//    | TTSB ID |   X  |   Y  | COHORT ID |
+//    +---------+------+------+-----------+
+//    |    27   |  12  |  12  |     13    |
+//
+//
+//   TTRK (32-BIT COMPARE)
+//
+//    0                                        63
+//    | TTSB ID | N/A |   X  |   Y  | COHORT ID |
+//    +---------+-----+------+------+-----------+
+//    |    27   |  5  |  12  |  12  |     8     |
+//
+//
+// OUTPUT:
+//
+//   TTSK v2:
+//
+//    0                                     63
+//    | TTSB ID | PREFIX |  N/A |  X |  Y |
+//    +---------+--------+------+----+----+
+//    |    27   | 1 (=0) |  12  | 12 | 12 |
+//
+//
+//   TTPK v1:
+//
+//    0                                        63
+//    | TTPB ID | ALL ZEROES | SPAN |  X  |  Y  |
+//    +---------+------------+------+-----+-----+
+//    |    27   |      1     |  12  | 12  | 12  |
+//
+//
+//   TTPK v2:
+//
+//    0                                       63
+//    | TTPB ID | PREFIX | SPAN |  X  |  Y  |
+//    +---------+--------+------+-----+-----+
+//    |    27   | 1 (=1) |  12  | 12  | 12  |
+//
+
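As an illustrative aside (an editor's sketch, not part of this change), the TTPK v2 layout above can be packed into a 64-bit key with plain shifts. The helper name and the literal bit offsets are assumptions derived from the field widths listed above; the real code uses the SKC_TTXK_*/SKC_TTPK_* macros from the headers.

// hypothetical sketch -- pack a TTPK v2 key from the widths listed above:
//   [ ttpb id:27 | prefix:1 | span:12 | x:12 | y:12 ]   (low bits to high bits)
static ulong
ttpk_v2_pack_sketch(uint const id, uint const span, uint const x, uint const y)
{
  return ((ulong)(id   & 0x7FFFFFF) <<  0) |  // 27-bit TTPB ID
         ((ulong) 1                 << 27) |  // prefix bit is 1 for a TTPK
         ((ulong)(span & 0xFFF)     << 28) |  // 12-bit tile span
         ((ulong)(x    & 0xFFF)     << 40) |  // 12-bit tile x
         ((ulong)(y    & 0xFFF)     << 52);   // 12-bit tile y
}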
+#define SKC_PREFIX_SUBGROUP_MASK  (SKC_PREFIX_SUBGROUP_SIZE - 1)
+
+//
+// smem accumulator
+//
+
+union skc_subgroup_accum
+{
+  struct {
+    SKC_ATOMIC_INT        ttp[SKC_TILE_HEIGHT];
+  } atomic;
+
+  struct {
+    skc_ttp_t             ttp[SKC_TILE_HEIGHT];
+  } aN;
+
+  struct {
+    SKC_PREFIX_TTP_V      ttp[SKC_PREFIX_SUBGROUP_SIZE];
+  } vN;
+
+  struct {
+    SKC_PREFIX_SMEM_ZERO  ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];
+  } zero;
+};
+
+//
+//
+//
+
+struct skc_subgroup_smem
+{
+  // prefix accumulator
+  union skc_subgroup_accum accum;
+};
+
+//
+//
+//
+
+static
+skc_uint
+skc_subgroup_lane()
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  return get_sub_group_local_id();
+#else
+  return 0;
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_PREFIX_TTS_V_BITFIELD
+skc_tts_get_dy(skc_tts_v_t const ttsv)
+{
+  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
+  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;
+
+  return dy - (~ttsv >> 31);
+}
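A worked example of the unpack above (editor's note; it assumes dy occupies the top bits of the word so both shifts are arithmetic): (~ttsv >> 31) evaluates to -1 when the packed dy is non-negative and to 0 when it is negative, so

//   packed   0  ->   0 - (-1) = +1
//   packed  31  ->  31 - (-1) = +32
//   packed  -1  ->  -1 -   0  = -1
//   packed -32  -> -32 -   0  = -32
//
// i.e. [-32,31] unpacks to [-32..-1,+1..+32], skipping zero.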
+
+static
+SKC_PREFIX_TTS_V_BITFIELD
+skc_tts_get_py(skc_tts_v_t const ttsv)
+{
+  return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);
+}
+
+//
+//
+//
+
+static
+void
+skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
+{
+  // get "altitude"
+  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);
+
+  // get the y pixel coordinate
+  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);
+
+  //
+  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
+  //
+  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
+  //
+
+#if 0
+  if (tts_v != SKC_TTS_INVALID)
+    printf("< %08X = %u : %d >\n",tts_v,py,dy); 
+#endif
+
+  //
+  // scatter-add the "altitude" to accumulator
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+  if (tts_v C != SKC_TTS_INVALID) {                                     \
+    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \
+  }
+
+#else
+  //
+  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
+  //
+  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                 \
+  if (tts_v C == SKC_TTS_INVALID)               \
+    return;                                     \
+  smem->accum.aN.ttp[py C] = dy C;
+#endif
+
+  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
+}
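For clarity, the per-component effect of the SIMT SKC_EXPAND_X expansion above is sketched below (assuming SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP reduces to a relaxed atomic add on local memory; this snippet is illustrative and not part of the change):

//   if (tts != SKC_TTS_INVALID)
//     atomic_add(smem->accum.atomic.ttp + skc_tts_get_py(tts),
//                skc_tts_get_dy(tts));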
+
+//
+// The implication here is that if our device configuration has a
+// rectangular 1:2 tile then we need a block size of at least 2
+// subblocks. The subblock size of course needs to match the length of
+// the smallest tile side.
+//
+
+static
+void
+skc_accum_flush(__local struct skc_subgroup_smem * const smem,
+                __global skc_bp_elem_t           * const bp_elems,
+                skc_block_id_t                     const pb_id)
+{
+  // load the ttp elements
+  SKC_PREFIX_TTP_V const ttp_v  = smem->accum.vN.ttp[get_sub_group_local_id()];
+  skc_uint         const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
+  
+#if   ( SKC_TILE_RATIO == 1 )
+
+  bp_elems[offset] = ttp_v;
+
+#elif ( SKC_TILE_RATIO == 2 )
+
+  vstore2(ttp_v,offset,bp_elems);
+
+#else
+
+#error("tile ratio greater than 2 not supported")
+
+#endif
+}
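One detail worth spelling out (editor's note): OpenCL's vstore2(data, offset, p) writes data.s0 to p[2*offset] and data.s1 to p[2*offset+1], so the offset computed above is in units of 2-element stores; that is why the subblock word count is divided by SKC_TILE_RATIO before being scaled by pb_id. Assuming skc_bp_elem_t is a 32-bit word and ttp_v is a 2-vector in this configuration, the ratio-2 branch is equivalent to:

//   bp_elems[2 * offset    ] = ttp_v.s0;
//   bp_elems[2 * offset + 1] = ttp_v.s1;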
+
+//
+//
+//
+
+static
+void
+skc_accum_reset(__local struct skc_subgroup_smem * const smem)
+{
+  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
+    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
+}
+
+//
+// get next sk key
+//
+
+static
+skc_ttsk_s_t
+skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
+                    skc_uint     * const sk_next,
+                    skc_int      * const rkpk_rem)
+{
+  // decrement count
+  *rkpk_rem -= 1;
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT with subgroup support is easy
+  //
+  // SIMT without subgroup support can always emulate with smem
+  //
+#if 0
+  //
+  // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly
+  // broadcast a uint2 cast to a long. It was probably bad to do this
+  // anyway without a union wrapping the TTSK scalar type.
+  //
+  // Consider creating a union { ulong; uint2 } at a later date --
+  // probably no need to ever do this unless it makes broadcast faster,
+  // which is unlikely since it will probably be implemented as 2
+  // 32-bit broadcasts.
+  //
+  // Additionally, the TTRK and TTXK key bitfield sizes are probably
+  // cast in stone and we aren't going to change them no matter what
+  // architecture we're on.
+  //
+  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
+#else
+  skc_ttsk_s_t sk_s;
+
+  sk_s.lo   = sub_group_broadcast(sk_v->lo,*sk_next);
+  sk_s.hi   = sub_group_broadcast(sk_v->hi,*sk_next);
+  *sk_next += 1;
+#endif
+
+#else
+  //
+  // SIMD will always grab component .s0 and then rotate the vector
+  //
+  sk_s = ( sk_v->s0 );
+
+  skc_ttsk_v_rotate_down(sk_v);
+
+#endif
+
+  return sk_s;
+}
+
+//
+//
+//
+
+static
+skc_raster_yx_s
+skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT with subgroup support is easy
+  //
+  // SIMT without subgroup support can always emulate with smem
+  //
+  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);
+
+#else
+  //
+  // SIMD will always grab component .s0 and then rotate the vector
+  //
+  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );
+
+#endif
+
+  return yx_s;
+}
+
+//
+// mask off ttsb id
+//
+
+static
+skc_block_id_s_t
+skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
+{
+  return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );
+}
+
+//
+// load tts_v as early as possible
+//
+
+static
+skc_tts_v_t
+skc_load_tts(__global skc_bp_elem_t * const bp_elems,
+             skc_block_id_s_t         const sb_id)
+{
+  return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );
+}
+
+//
+// massage ttrk keys into ttsk keys
+//
+
+static
+void
+skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
+{
+  sk_v->lo = sk_v->lo  & SKC_TTXK_LO_MASK_ID;     // clear high (N/A) bits
+  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits
+}
+
+//
+// replenish ttsk keys
+//
+
+static
+void
+skc_ttsk_v_replenish(skc_ttsk_v_t                * const sk_v,
+                     skc_uint                    * const sk_next,
+                     skc_uint                    * const rks_next,
+                     __global skc_ttrk_e_t const * const rks)
+{
+  // if there are still keys available then return
+  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
+    return;
+
+  //
+  // otherwise, replenish sk_v
+  //
+  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
+  // divisible by TTXK_V_SIZE and therefore loading some keys from the
+  // next raster is OK.
+  //
+  *sk_next   = 0;
+  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
+  *sk_v      = rks[*rks_next];
+
+#if 0
+  printf("* %08X ( %3u, %3u )\n",
+         sk_v->hi,
+         (sk_v->hi >> 12) & 0xFFF,
+         (sk_v->hi      ) & 0xFFF);
+#endif
+  
+  skc_ttrk_to_ttsk(sk_v);
+
+#if 0
+  printf("! %08X ( %3u, %3u )\n",
+         sk_v->hi,
+         (sk_v->hi >> 20) & 0xFFF,
+         (sk_v->hi >>  8) & 0xFFF);
+#endif
+}
+
+//
+// replenish block ids
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+
+static
+void
+skc_blocks_replenish(skc_uint                      * const blocks_next,
+                     skc_uint                      * const blocks_idx,
+                     skc_block_id_v_t              * const blocks,
+                     skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
+                     __global skc_block_id_t const * const bp_ids)
+
+{
+  *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE;
+  *blocks      = bp_ids[*blocks_idx & bp_mask];
+  *blocks_next = 0;
+
+#if 0
+  printf("replenish blocks: %u\n",*blocks);
+#endif
+}
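A small worked example of the pow2 ring indexing (editor's sketch; the pool size is hypothetical): with a block-id pool of 1 << 20 entries, bp_mask == 0x000FFFFF, and blocks_idx can grow without bound while the masked lookup simply wraps around the ring:

//   blocks_idx = 0x00100007  ->  bp_ids[0x00100007 & 0x000FFFFF] == bp_ids[0x00000007]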
+
+//
+//
+//
+
+static
+skc_block_id_t
+skc_blocks_get_next(skc_uint                      * const blocks_next,
+                    skc_uint                      * const blocks_idx,
+                    skc_block_id_v_t              * const blocks,
+                    skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
+                    __global skc_block_id_t const * const bp_ids)
+{
+  // replenish?
+  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
+    {
+      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+    }
+
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
+
+#else
+  //
+  // SIMD
+  //
+  skc_block_id_t id = blocks->s0;
+
+  skc_shuffle_down_1(*blocks);
+
+#endif
+
+  *blocks_next += 1;
+
+  return id;
+}
+
+//
+// subblock allocator
+//
+
+#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
+
+static
+skc_block_id_t
+skc_subblocks_get_next_pb_id(skc_block_id_t                * const subblocks,
+                             skc_uint                      * const blocks_next,
+                             skc_uint                      * const blocks_idx,
+                             skc_block_id_v_t              * const blocks,
+                             skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
+                             __global skc_block_id_t const * const bp_ids)
+{
+  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+    }
+
+  skc_block_id_t const pb_id = *subblocks;
+
+  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks
+
+  return pb_id;
+}
+
+#endif
+
+//
+// append a ttsk key to the work-in-progress node
+//
+
+static
+void
+skc_node_v_append_sk(skc_ttsk_s_t            const * const sk_s,
+
+                     skc_ttxk_v_t                  * const xk_v,
+                     skc_uint                      * const xk_v_next,
+                     skc_uint                      * const xk_v_idx,
+                     __global skc_bp_elem_t        * const bp_elems,
+
+                     skc_int                         const rkpk_rem,
+
+                     skc_uint                      * const blocks_next,
+                     skc_uint                      * const blocks_idx,
+                     skc_block_id_v_t              * const blocks,
+                     skc_uint                        const bp_mask,
+                     __global skc_block_id_t const * const bp_ids)
+{
+  //
+  // Append an sk key to the in-register xk_v vector
+  //
+  // If the work-in-progress node in gmem will only have room for one
+  // more key then:
+  //
+  //   - if this was the final SK then write out xk_v and exit
+  //
+  //   - otherwise, acquire a block id, link it, write out xk_v,
+  //     prepare new node
+  //
+  // Note that this does *not* try to squeeze a final key into the
+  // next node slot.  This optimization isn't worth the added
+  // down-pipeline complexity.
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
+    {
+      *xk_v = *sk_s;
+    }
+
+  *xk_v_next += 1;
+
+  // are there more keys coming?
+  if (rkpk_rem > 0)
+    {
+      // is the node almost full?
+      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
+        {
+          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+
+          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
+            {
+              xk_v->lo = id;
+              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
+            }
+
+          // store xk_v (uint2) to bp (uint)
+          bp_elems[*xk_v_idx                         ] = xk_v->lo;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+          // reinitialize xk_v
+          xk_v->lo = SKC_UINT_MAX;
+          xk_v->hi = SKC_UINT_MAX;
+
+          // update node elem idx
+          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+          // reset node count
+          *xk_v_next = 0;
+        }
+      // is xk_v full?
+      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
+        {
+          // store xk_v to bp
+          bp_elems[*xk_v_idx                         ] = xk_v->lo;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+          // reinitialize xk_v
+          xk_v->lo = SKC_UINT_MAX;
+          xk_v->hi = SKC_UINT_MAX;
+
+          // increment node elem idx
+          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+        }
+    }
+  else
+    {
+      bp_elems[*xk_v_idx                         ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
+        {
+          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+
+          bp_elems[*xk_v_idx]                          = SKC_UINT_MAX;
+          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
+        }
+    }
+
+#else
+  //
+  // SIMD
+  //
+
+#endif
+}
+
+//
+//
+//
+
+static
+skc_ttpk_s_t
+skc_ttpk_create(skc_raster_yx_s const yx_prev,
+                skc_raster_yx_s const yx_next,
+                skc_block_id_t  const pb_id)
+{
+  // - yx_prev is already incremented by one 
+  // - yx_span is already shifted up at hi.x
+  skc_uint const yx_span = yx_next - yx_prev;
+
+  skc_ttpk_s_t pk;
+
+  // turn on prefix bit | shift span bits upward
+  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);
+
+  // shift down high span bits | yx of tile
+  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("* %08v2X : %u\n",pk,yx_span);
+#endif
+
+  return pk;
+}
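A quick worked example of the span arithmetic, as read from the call site below (editor's note): the caller passes yx_prev already advanced by SKC_TTXK_HI_ONE_X, so the span counts the whole tiles skipped between two TTSK keys on the same row; the encoded value carries this count at the hi.x bit offset, per the comment that yx_span is "already shifted up at hi.x".

//   previous TTSK tile x = 5, next TTSK tile x = 9
//   yx_prev as passed    = x 6, yx_next = x 9
//   span (in tile-x units) = 9 - 6 = 3  -> the TTPK covers skipped tiles 6, 7, 8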
+
+//
+// append a ttpk key to the work-in-progress node
+//
+
+static
+void
+skc_node_v_append_pk(skc_ttpk_s_t            const * const pk_s,
+
+                     skc_ttxk_v_t                  * const xk_v,
+                     skc_uint                      * const xk_v_next,
+                     skc_uint                      * const xk_v_idx,
+                     __global skc_bp_elem_t        * const bp_elems,
+
+                     skc_uint                      * const blocks_next,
+                     skc_uint                      * const blocks_idx,
+                     skc_block_id_v_t              * const blocks,
+                     skc_uint                        const bp_mask,
+                     __global skc_block_id_t const * const bp_ids)
+{
+  //
+  // append a pk key to the in-register xk_v vector
+  //
+  // if the work-in-progress node in gmem will only have room for one
+  // more key then:
+  //
+  //   - if this was the final SK then write out xk_v and exit
+  //
+  //   - otherwise, acquire a block id, link it, write out xk_v,
+  //     prepare new node
+  //
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
+    {
+      *xk_v = *pk_s;
+    }
+
+  *xk_v_next += 1;
+
+  // is the node almost full?
+  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
+    {
+      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
+
+      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
+        {
+          xk_v->lo = id;
+          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
+        }
+
+      // store xk_v to bp
+      bp_elems[*xk_v_idx                         ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      // reinitialize xk_v
+      xk_v->lo = SKC_UINT_MAX;
+      xk_v->hi = SKC_UINT_MAX;
+
+      // update node elem idx
+      *xk_v_idx  = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+      // reset node count
+      *xk_v_next = 0;
+    }
+  // is xk_v full?
+  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
+    {
+      // store xk_v to bp
+      bp_elems[*xk_v_idx                         ] = xk_v->lo;
+      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
+#if 0
+      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
+#endif
+      // reinitialize xk_v
+      xk_v->lo = SKC_UINT_MAX;
+      xk_v->hi = SKC_UINT_MAX;
+      
+      // increment node elem idx
+      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+    }
+
+#else
+  //
+  // SIMD
+  //
+#endif
+}
+
+//
+// append the first 3 fields of meta info to the raster header
+//
+
+static
+void
+skc_node_v_init_header(skc_ttxk_v_t                           * const xk_v,
+                       skc_uint                               * const xk_v_next,
+                       union skc_raster_cohort_meta_out const * const meta)
+{
+#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  if (get_sub_group_local_id() < 2)
+    {
+      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
+    }
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("header: %08v4X\n",meta->u32v4);
+#endif
+
+  //
+  // increment counter: uint4 + uint4 = uint2 x 4
+  //
+  *xk_v_next = 2 + 2; // +2 for uninitialized bounds
+
+#else
+  //
+  // SIMD
+  //
+
+#endif
+}
+
+//
+//
+//
+
+__kernel
+SKC_PREFIX_KERNEL_ATTRIBS
+void
+skc_kernel_prefix(__global skc_uint       const * const bp_atomics,
+                  __global skc_block_id_t const * const bp_ids,
+                  __global skc_bp_elem_t        * const bp_elems,
+                  skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
+                  __global skc_ttrk_e_t   const * const rks,
+                  __global skc_block_id_t       * const map,
+                  __global skc_uint       const * const metas,
+                  skc_uint                        const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem                  smem[1];
+#else
+  __local struct skc_subgroup_smem                  smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
+#endif
+
+  //
+  // where is this subgroup in the grid?
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const sgi = get_group_id(0);
+#else
+  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  skc_uint const sgl = get_sub_group_local_id();
+
+  //
+  // return if this subgroup is excess
+  //
+#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
+  if (sgi >= count)
+    return;
+#endif
+
+  //
+  // get meta info for this subgroup's raster
+  //
+  union skc_raster_cohort_meta_out const meta  = { vload4(sgi,metas) };
+  skc_uint                         const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("%3u : %5u / %5u / %5u / %5u / %u\n",
+           sgi,
+           meta.blocks,
+           meta.offset,
+           meta.nodes,
+           meta.keys,
+           reads);
+#endif
+
+  //
+  // preload blocks -- align on subgroup
+  //
+  skc_uint         blocks_idx  = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
+  skc_block_id_v_t blocks      = bp_ids[blocks_idx & bp_mask];
+  skc_uint         blocks_next = (reads &  SKC_PREFIX_SUBGROUP_MASK);
+
+  //
+  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
+  //
+  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
+
+  //
+  // initialize raster header -- assumes block is greater than 8 words (4 doublewords)
+  //
+  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
+  skc_uint     xk_v_next;
+
+  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);
+
+  //
+  // no keys -- this is an empty raster!
+  //
+  if (meta.keys == 0)
+    {
+      bp_elems[xk_v_idx                         ] = xk_v.lo;
+      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;
+
+      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
+        {
+          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
+
+          bp_elems[xk_v_idx]                          = SKC_UINT_MAX;
+          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
+        }
+
+      return;
+    }
+
+  //
+  // load TTRK keys and in-place convert to TTSK keys
+  //
+  skc_uint         rks_next    = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
+  skc_ttsk_v_t     sk_v        = rks[rks_next];
+  skc_uint         sk_next     = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);
+  skc_int          rkpk_rem    = meta.keys; // signed count of remaining rk+pk keys
+
+#if 0
+  printf("* %08X ( %3u, %3u )\n",
+         sk_v.hi,
+         (sk_v.hi >> 12) & 0xFFF,
+         (sk_v.hi      ) & 0xFFF);
+#endif
+  
+  skc_ttrk_to_ttsk(&sk_v);
+
+#if 0
+  printf("! %08X ( %3u, %3u )\n",
+         sk_v.hi,
+         (sk_v.hi >> 20) & 0xFFF,
+         (sk_v.hi >>  8) & 0xFFF);
+#endif
+
+  //
+  // subblocks
+  //
+#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
+  skc_block_id_t subblocks = 0;
+#endif
+
+  //
+  // begin "scan" of tiles
+  //
+  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);
+
+  //
+  // zero the accumulator
+  //
+  skc_accum_reset(smem);
+
+  while (true)
+    {
+      // get next rk key
+      skc_ttsk_s_t     const sk_s  = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);
+
+      // load ttsb id
+      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);
+
+      // load tts_v transaction "in flight" as early as possible
+      skc_tts_v_t      const tts_v = skc_load_tts(bp_elems,sb_id);
+
+#if 0
+      printf("{ %08X }\n",tts_v);
+#endif
+
+#if 0
+      if (get_sub_group_local_id() == 0)
+        printf("[ %d, %X ]\n",rkpk_rem,sb_id);
+#endif
+
+#if 0
+      if (get_sub_group_local_id() == 0)
+        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
+#endif
+
+      //
+      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
+      // TIME AND SIMD'IZED
+      //
+
+      // if yx's don't match then we're either issuing a ttpk or
+      // resetting the accumulator
+      if (sk_s.hi != yx_prev)
+        {
+          // if yx_next.y == yx_last.y then x changed
+          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
+            {
+              //
+              // if the tile is not square then its ratio is 1:2
+              //
+#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
+              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
+                                                                        &blocks_next,
+                                                                        &blocks_idx,
+                                                                        &blocks,
+                                                                        bp_mask,
+                                                                        bp_ids);
+#else
+              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
+                                                               &blocks_idx,
+                                                               &blocks,
+                                                               bp_mask,
+                                                               bp_ids);
+#endif
+
+              // flush accumulated ttp vector to block/subblock at ttpb_id
+              skc_accum_flush(smem,bp_elems,pb_id);
+
+#if 0
+              if (get_sub_group_local_id() == 0)
+                {
+                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
+                         pb_id,
+                         (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
+                         (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
+                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
+                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
+                }
+#endif
+
+              //
+              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
+              //
+              rkpk_rem -= 1;
+
+              // create the pk
+              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);
+
+              // append pk key to xk buffer
+              skc_node_v_append_pk(&pk_s,
+
+                                   &xk_v,
+                                   &xk_v_next,
+                                   &xk_v_idx,
+                                   bp_elems,
+
+                                   &blocks_next,
+                                   &blocks_idx,
+                                   &blocks,
+                                   bp_mask,
+                                   bp_ids);
+            }
+          else if (rkpk_rem > 0) // we're starting a new tile row
+            {
+              skc_accum_reset(smem);
+            }
+        }
+
+      //
+      // append sk key to node_v
+      //
+      // if rkpk_rem is zero then return from kernel
+      //
+      skc_node_v_append_sk(&sk_s,
+
+                           &xk_v,
+                           &xk_v_next,
+                           &xk_v_idx,
+                           bp_elems,
+
+                           rkpk_rem,
+
+                           &blocks_next,
+                           &blocks_idx,
+                           &blocks,
+                           bp_mask,
+                           bp_ids);
+
+      // we're done if no more sk keys
+      if (rkpk_rem == 0)
+        break;
+
+      // move to new tile
+      yx_prev = sk_s.hi;
+
+      // scatter tts values into accumulator
+      skc_accum_scatter(smem,tts_v);
+
+      // replenish sk keys
+      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
index e622845..f20f645 100644
--- a/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/rasterize.cl
@@ -1,3366 +1,3366 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "common.h"

-#include "atomic_cl.h"

-#include "block_pool_cl.h"

-#include "raster_builder_cl_12.h"

-#include "device_cl_12.h"

-

-// #define SKC_ARCH_AVX2

-// #define SKC_RASTERIZE_SIMD_USES_SMEM

-

-#define PRINTF_ENABLE       0

-#define PRINTF_BLOCK_COUNT  0

-

-//

-// NOTE:

-//

-// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT

-// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE

-//

-// NOTE:

-//

-// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.

-//

-//

-

-#if 0 // SKC_ARCH_AVX2

-

-// #define SKC_RASTERIZE_SUBGROUP_SIZE              1

-// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3

-// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1

-

-// #define SKC_TTXB_WORDS                           8

-

-// #define SKC_RASTERIZE_FLOAT                      float8

-// #define SKC_RASTERIZE_UINT                       uint8

-// #define SKC_RASTERIZE_INT                        int8

-// #define SKC_RASTERIZE_PREDICATE                  int8

-

-// #define SKC_RASTERIZE_BIN_BLOCK                  uint16

-// #define SKC_RASTERIZE_BIN                        uint8

-

-// #define SKC_RASTERIZE_POOL                       uint8

-// #define SKC_RASTERIZE_POOL_SCALE                 6

-

-// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1

-// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2

-

-// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()

-

-#endif

-

-//

-// SIMT

-//

-

-#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE

-#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE

-#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)

-

-//

-//

-//

-

-#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)

-#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)

-

-//

-//

-//

-

-#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }

-#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }

-

-//

-//

-//

-

-#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)

-#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)

-#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)

-#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)

-#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)

-#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)

-

-//

-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"

-//

-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/

-//

-// Lerp in two fma/mad ops:

-//

-//    t * b + ((-t) * a + a)

-//

-// Note: OpenCL documents mix() as being implemented as:

-//

-//    a + (b - a) * t

-//

-// But this may be a native instruction on some devices. For example,

-// on GEN9 there is an LRP "linear interpolation" opcode but it

-// doesn't appear to support half floats.

-//

-// Feel free to toggle this option and then benchmark and inspect the

-// generated code.  We really want the double FMA to be generated when

-// there isn't support for a LERP/MIX operation.

-//

-

-#if 1

-#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))

-#else

-#define SKC_LERP(a,b,t)      mix(a,b,t)

-#endif
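For completeness, the two forms above are algebraically identical; expanding the fused-multiply-add version recovers the textbook mix():

    t*b + ((-t)*a + a)  =  t*b - t*a + a  =  a + (b - a)*t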

-

-//

-// There is no integer MAD in OpenCL with "don't care" overflow

-// semantics.

-//

-// FIXME -- verify if the platform needs explicit MAD operations even

-// if a "--fastmath" option is available at compile time.  It might

-// make sense to explicitly use MAD calls if the platform requires it.

-//

-

-#if 1

-#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))

-#else

-#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)

-#endif

-

-//

-//

-//

-

-#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())

-

-//

-//

-//

-

-union skc_bp_elem

-{

-  skc_uint              u32;

-  skc_tagged_block_id_t tag_id;

-  skc_float             coord;

-};

-

-//

-//

-//

-

-struct skc_subgroup_smem

-{

-  //

-  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member

-  //

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )

-  struct {

-    union {

-

-      skc_uint                winner;

-

-      struct {

-        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];

-      } aN;

-

-      struct {

-        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];

-      } vN;

-    };

-  } subgroup;

-#endif

-

-  //

-  // work-in-progress TTSB blocks and associated YX keys

-  //

-  union {

-    struct {

-      // FIXME -- some typedefs are valid here

-      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];

-      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];

-      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];

-      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];

-    } aN;

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-    struct {

-      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];

-      SKC_RASTERIZE_BIN       yx;

-      SKC_RASTERIZE_BIN       id;

-      SKC_RASTERIZE_BIN       count;

-    } vN;

-#endif

-  } bin;

-};

-

-//

-//

-//

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-#define skc_subgroup_lane()  0

-#else

-#define skc_subgroup_lane()  get_sub_group_local_id()

-#endif

-

-//

-// replenish block ids

-//

-// note that you can't overrun the block id pool since it's a ring

-//

-

-static

-void

-skc_blocks_replenish(skc_uint                           * const blocks_next,

-                     skc_block_id_v_t                   * const blocks,

-                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,

-                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring

-                     __global skc_block_id_t   const    * const bp_ids)

-{

-  //

-  // get a new vector of block ids -- this is kind of a narrow

-  // allocation but subblocks help stretch out the pool.

-  //

-  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids

-  //

-  skc_uint bp_idx = 0;

-

-  if (skc_subgroup_lane() == 0)

-    {

-      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,

-                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads

-#if 0

-      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);

-#endif

-    }

-

-  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;

-  *blocks      = bp_ids[bp_idx];

-  *blocks_next = 0;

-}

-

-//

-//

-//

-

-static

-skc_block_id_t

-skc_blocks_get_next(skc_uint                           * const blocks_next,

-                    skc_block_id_v_t                   * const blocks,

-                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,

-                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring

-                    __global skc_block_id_t   const    * const bp_ids)

-{

-  // replenish?

-  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)

-    {

-      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);

-    }

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )

-  //

-  // SIMT

-  //

-  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

-

-#else

-  //

-  // SIMD

-  //

-  skc_block_id_t id = blocks->s0;

-

-  skc_shuffle_down_1(*blocks);

-

-#endif

-

-  *blocks_next += 1;

-

-  return id;

-}

-

-//

-// subblock allocator

-//

-

-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

-

-static

-skc_block_id_t

-skc_subblocks_get_next(skc_block_id_t                     * const subblocks,

-                       skc_uint                           * const blocks_next,

-                       skc_block_id_v_t                   * const blocks,

-                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,

-                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring

-                       __global skc_block_id_t   const    * const bp_ids)

-{

-  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)

-    {

-      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);

-    }

-

-  skc_block_id_t const sb_id = *subblocks;

-

-  *subblocks += 1;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("= %u\n",sb_id);

-#endif

-

-  return sb_id;

-}

-

-

-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks

-#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks

-

-#else

-

-#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks

-#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks

-

-#endif

-

-//

-//

-//

-

-static

-skc_block_id_t

-skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),

-                  skc_uint                           * const blocks_next,

-                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,

-                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring

-                  __global skc_block_id_t   const    * const bp_ids,

-                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,

-                  skc_ttsk_v_t                       * const sk_v,

-                  skc_uint                           * const sk_v_next,

-                  __global skc_ttsk_s_t              * const sk_extent,

-                  skc_uint                             const new_yx)

-{

-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

-  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,

-                                                       blocks_next,

-                                                       blocks,

-                                                       bp_atomics,

-                                                       bp_mask,

-                                                       bp_ids);

-#else

-  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,

-                                                    blocks,

-                                                    bp_atomics,

-                                                    bp_mask, // pow2 modulo mask for block pool ring

-                                                    bp_ids);

-#endif

-

-  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))

-    {

-      sk_v->lo = new_id;

-      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;

-#if 0

-      printf("@ ( %3u, %3u ) %u\n",

-             (new_yx >> 12) & 0xFFF,

-             (new_yx      ) & 0xFFF,

-             new_id);

-#endif

-    }

-

-  *sk_v_next += 1;

-

-  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)

-    {

-      *sk_v_next = 0;

-

-      skc_uint sk_idx = 0;

-

-      if (skc_subgroup_lane() == 0)

-        {

-          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE

-            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);

-#if 0

-          printf("+ %u\n",sk_idx);

-#endif

-        }

-

-      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )

-      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)

-#endif

-        {

-          sk_extent[sk_idx] = *sk_v;

-#if 0

-          printf("> %u : %v2u\n",sk_idx,*sk_v);

-#endif

-        }

-    }

-

-  return new_id;

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  // Note that there isn't a built-in horizontal scan for vectors so

-  // we'll define some here for various widths.

-  //

-  // FIXME -- a scalar version might be faster so put in a

-  // compile-time switch to select between implementations

-  //

-

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  // 01

-  //  0 +

-  // --

-  // 01

-  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);

-  return w;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  // 0123

-  //  012 +

-  // ----

-  // 0123

-  //   01 +

-  // ----

-  // 0123

-  //

-  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);

-  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);

-  return x;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  // 01234567

-  //  0123456 +

-  // --------

-  // 01234567

-  //   012345 +

-  // --------

-  // 01234567

-  //     0123 +

-  // --------

-  // 01234567

-  //

-  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);

-  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);

-  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);

-  return y;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  // 0123456789abcdef

-  //  0123456789abcde +

-  // ----------------

-  // 0123456789abcdef

-  //   0123456789abcd +

-  // ----------------

-  // 0123456789abcdef

-  //     0123456789ab +

-  // ----------------

-  // 0123456789abcdef

-  //         01234567 +

-  // ----------------

-  // 0123456789abcdef

-  //

-  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);

-  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);

-  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);

-  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);

-  return z;

-

-#endif

-

-#else

-  //

-  // SIMT

-  //

-

-  return sub_group_scan_inclusive_add(v);

-

-#endif

-}
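A numeric trace of the 4-wide SIMD case above may make the shuffle/mask pattern clearer (editor's example):

    v = (1, 2, 3, 4)
    w = v.s3012 * (0,1,1,1) + v  =  (4,1,2,3)*(0,1,1,1) + (1,2,3,4)  =  (1, 3, 5, 7)
    x = w.s2301 * (0,0,1,1) + w  =  (5,7,1,3)*(0,0,1,1) + (1,3,5,7)  =  (1, 3, 6, 10)

which is the inclusive prefix sum of (1, 2, 3, 4).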

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_UINT

-skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  // Note that there isn't a built-in horizontal scan for vectors so

-  // we'll define some here for various widths.

-  //

-  // FIXME -- a scalar version might be faster so put in a

-  // compile-time switch to select between implementations

-  //

-

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  // 01

-  //  0 +

-  // --

-  // 01

-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);

-  return w;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  // 0123

-  //  012 +

-  // ----

-  // 0123

-  //   01 +

-  // ----

-  // 0123

-  //

-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);

-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);

-  return x;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  // 01234567

-  //  0123456 +

-  // --------

-  // 01234567

-  //   012345 +

-  // --------

-  // 01234567

-  //     0123 +

-  // --------

-  // 01234567

-  //

-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);

-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);

-  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);

-  return y;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  // 0123456789abcdef

-  //  0123456789abcde +

-  // ----------------

-  // 0123456789abcdef

-  //   0123456789abcd +

-  // ----------------

-  // 0123456789abcdef

-  //     0123456789ab +

-  // ----------------

-  // 0123456789abcdef

-  //         01234567 +

-  // ----------------

-  // 0123456789abcdef

-  //

-  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);

-  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);

-  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);

-  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);

-  return z;

-

-#endif

-

-#else

-  //

-  // SIMT

-  //

-

-  return sub_group_scan_inclusive_add(v);

-

-#endif

-}
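
The vector cases above are shift-and-add (Hillis-Steele) scan passes: each step adds in the neighbor 1, 2, 4, ... lanes to the left, with the rotate-plus-0/1-mask trick zeroing out the lanes that wrapped around. A minimal host-side C sketch of the same idea for a 4-wide vector (illustration only; the array names are made up and nothing here is part of the kernel):

#include <stdio.h>

int main(void)
{
  unsigned v[4] = { 1, 2, 3, 4 };
  unsigned w[4], x[4];

  // pass 1: add the neighbor 1 lane to the left (lane 0 keeps its value)
  for (int i = 0; i < 4; i++)
    w[i] = v[i] + (i >= 1 ? v[i - 1] : 0);

  // pass 2: add the neighbor 2 lanes to the left
  for (int i = 0; i < 4; i++)
    x[i] = w[i] + (i >= 2 ? w[i - 2] : 0);

  // prints the inclusive prefix sum: 1 3 6 10
  for (int i = 0; i < 4; i++)
    printf("%u ", x[i]);
  printf("\n");

  return 0;
}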

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_UINT

-skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  // Note that there isn't a built-in horizontal scan for vectors so

-  // we'll define some here for various widths.

-  //

-  // FIXME -- a scalar version might be faster so put in a

-  // compile-time switch to select between implementations

-  //

-

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  // 01

-  // 00 max

-  // --

-  // 01

-  SKC_RASTERIZE_UINT const w = max(v.s00,v);

-  return w;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  // 0123

-  // 0012 +

-  // ----

-  // 0123

-  // 0101 +

-  // ----

-  // 0123

-  //

-  SKC_RASTERIZE_UINT const w = max(v.s0012,v);

-  SKC_RASTERIZE_UINT const x = max(w.s0101,w);

-  return x;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  // 01234567

-  // 00123456 +

-  // --------

-  // 01234567

-  // 01012345 +

-  // --------

-  // 01234567

-  // 01230123 +

-  // --------

-  // 01234567

-  //

-  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);

-  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);

-  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);

-  return y;

-

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  // 0123456789abcdef

-  // 00123456789abcde +

-  // ----------------

-  // 0123456789abcdef

-  // 010123456789abcd +

-  // ----------------

-  // 0123456789abcdef

-  // 01230123456789ab +

-  // ----------------

-  // 0123456789abcdef

-  // 0123456701234567 +

-  // ----------------

-  // 0123456789abcdef

-  //

-  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);

-  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);

-  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);

-  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);

-  return z;

-

-#endif

-

-#else

-  //

-  // SIMT

-  //

-

-  return sub_group_scan_inclusive_max(v);

-

-#endif

-}

-

-//

-//

-//

-

-static

-float

-skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  return v.s1;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  return v.s3;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  return v.s7;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  return v.sf;

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

-

-#endif

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_UINT

-skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  return v.s1;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  return v.s3;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  return v.s7;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  return v.sf;

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

-

-#endif

-}

-

-//

-//

-//

-

-static

-float

-skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-#else

-  return v.s0;

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return sub_group_broadcast(v,0);

-

-#endif

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,

-                      SKC_RASTERIZE_UINT  const i)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return v;

-#else

-  return shuffle(v,i);

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return intel_sub_group_shuffle(v,i);

-

-#endif

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous

-                          SKC_RASTERIZE_FLOAT const c) // current

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  // FIXME -- there are alternative formulations here:

-  //

-  // Option 1:

-  //

-  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))

-  //

-  // Option 2:

-  //

-  //   p is a scalar

-  //   t    = c.rotate(+1)

-  //   t.s0 = p;

-  //

-  // Option 3: ...

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return p;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  return shuffle2(p,c,(uint2)(1,2));

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  return shuffle2(p,c,(uint4)(3,4,5,6));

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return intel_sub_group_shuffle_up(p,c,1);

-

-#endif

-}
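
For reference, the 4-wide case above evaluates to { p.s3, c.s0, c.s1, c.s2 }: the current vector shifted up by one lane, with the previous vector's last lane filling lane 0, mirroring what intel_sub_group_shuffle_up() does across a subgroup. A tiny C sketch of that lane movement (illustration only):

#include <stdio.h>

int main(void)
{
  // "previous" and "current" 4-wide vectors
  float const p[4] = { 0.f, 1.f, 2.f, 3.f };
  float const c[4] = { 4.f, 5.f, 6.f, 7.f };

  // shuffle2(p,c,(uint4)(3,4,5,6)): indices 0..3 select from p, 4..7 from c
  unsigned const idx[4] = { 3, 4, 5, 6 };
  float r[4];

  for (int i = 0; i < 4; i++)
    r[i] = (idx[i] < 4) ? p[idx[i]] : c[idx[i] - 4];

  // prints: 3 4 5 6
  for (int i = 0; i < 4; i++)
    printf("%g ", r[i]);
  printf("\n");

  return 0;
}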

-

-//

-//

-//

-

-static

-bool

-skc_is_lane_first()

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)

-  //

-  // SIMD

-  //

-  return true;

-#else

-  //

-  // SIMT

-  //

-  return get_sub_group_local_id() == 0;

-#endif

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_delta_offset()

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-  return 1;

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )

-  return (SKC_RASTERIZE_FLOAT)( 1, 2 );

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )

-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )

-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );

-#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )

-  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );

-#endif

-

-#else

-  //

-  // SIMT

-  //

-  return 1.0f + get_sub_group_local_id();

-

-#endif

-

-}

-

-//

-//

-//

-

-static

-int

-skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  return any(p);

-#else

-  //

-  // SIMT

-  //

-  return sub_group_any(p);

-#endif

-}

-

-//

-//

-//

-

-#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)

-

-void

-skc_segment_next(__global union skc_bp_elem * const bp_elems,

-                 skc_uint                   * const nodeword,

-                 skc_block_id_t             * const id)

-{

-  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)

-    {

-      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))

-        {

-          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;

-        }

-

-      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;

-

-      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

-    }

-}

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)

-{

-  return native_sqrt(x * x + y * y);

-}

-

-//

-// Wang's Formula (1985)

-//

-

-#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned

-

-#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)

-

-#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))

-#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))

-

-#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)

-#define SKC_WANG_SQRT(x)      native_sqrt(x)

-

-//

-//

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,

-                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,

-                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,

-                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)

-{

-  //

-  // Return the number of evenly spaced (in the parametric sense) line

-  // segments that are guaranteed to be within "epsilon" error of the

-  // curve.

-  //

-  // We're then going to take multiples of the reciprocal of this

-  // number so that the segmentation can be distributed across the

-  // subgroup.

-  //

-  // Note, this can probably be slightly optimized per architecture

-  // but it's probably far from being a hotspot since it's all

-  // straight-line unpredicated code.

-  //

-  // The result is an integer-valued float in the range [1.0,#segments]

-  //

-  // Note that even if all of the control points are coincident, the

-  // max(1.0f) will categorize this as a line of 1 segment.

-  //

-  // This is what we want!  We want to convert cubics to lines as

-  // easily as possible and *then* cull lines that are either

-  // horizontal or zero length.

-  //

-  return max(1.0f,

-             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *

-                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),

-                                                    fabs(t3x - 2.0f * t2x + t1x)),

-                                                max(fabs(t2y - 2.0f * t1y + t0y),

-                                                    fabs(t3y - 2.0f * t2y + t1y))))));

-}
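
A scalar sketch of the cubic estimate above for a single control cage. The epsilon below is an assumed value chosen just for the example (the kernel's epsilon is SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32, and the subpixel resolution isn't fixed in this file), so the resulting segment count is illustrative only:

#include <math.h>
#include <stdio.h>

int main(void)
{
  // hypothetical epsilon in subpixels -- an assumption, not the kernel's value
  float const epsilon = 8.0f;
  float const k_cubic = (3.0f * 2.0f) / (8.0f * epsilon);

  // control cage (0,0) (0,100) (100,100) (100,0)
  float const t0x = 0.f,   t0y = 0.f;
  float const t1x = 0.f,   t1y = 100.f;
  float const t2x = 100.f, t2y = 100.f;
  float const t3x = 100.f, t3y = 0.f;

  // max second differences of the control points, per axis
  float const dx = fmaxf(fabsf(t2x - 2.f * t1x + t0x), fabsf(t3x - 2.f * t2x + t1x));
  float const dy = fmaxf(fabsf(t2y - 2.f * t1y + t0y), fabsf(t3y - 2.f * t2y + t1y));

  // segments = max(1, ceil(sqrt(k * length(dx,dy)))) -- here the answer is 4
  float const segs = fmaxf(1.0f, ceilf(sqrtf(k_cubic * sqrtf(dx * dx + dy * dy))));

  printf("%g\n", segs);
  return 0;
}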

-

-static

-SKC_RASTERIZE_FLOAT

-skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,

-                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,

-                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)

-{

-  return max(1.0f,

-             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *

-                                SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),

-                                                fabs(t2y - 2.0f * t1y + t0y)))));

-}

-

-//

-// rational curves

-//

-

-static

-SKC_RASTERIZE_FLOAT

-skc_wangs_formula_cubic_rat()

-{

-  return 0.0f;

-}

-

-static

-SKC_RASTERIZE_FLOAT

-skc_wangs_formula_quad_rat()

-{

-  return 0.0f;

-}

-

-//

-// flush any work-in-progress blocks and return unused block ids

-//

-

-static

-void

-skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,

-             __global union skc_bp_elem                 * const bp_elems,

-             __global uint                              * const bp_ids,

-             skc_uint                                     const bp_mask,

-             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,

-             skc_block_id_v_t                           * const blocks,

-             skc_uint                                     const blocks_next,

-             skc_ttsk_v_t                               * const sk_v,

-             skc_uint                                     const sk_v_next,

-             __global skc_ttsk_s_t                      * const sk_extent,

-             __local  struct skc_subgroup_smem volatile * const smem)

-{

-  //

-  // flush non-empty bins

-  //

-  // FIXME -- accelerate this iteration/search with a subgroup operation

-  //

-  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)

-    {

-      if (smem->bin.aN.count[ii] > 0)

-        {

-          skc_block_id_v_t const id  = smem->bin.aN.id[ii];

-          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

-          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];

-#if 0

-          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);

-#endif

-          bp_elems[idx].u32 = tts;

-        }

-

-      //

-      // FIXME -- vectorize with vstoreN()

-      //

-    }

-

-  //

-  // return remaining block ids back to the pool

-  //

-  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;

-

-  if (blocks_rem > 0)

-    {

-      skc_uint bp_idx = 0;

-

-      if (skc_subgroup_lane() == 0)

-        {

-          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);

-

-#if 0

-          printf("r-: %8u + %u\n",bp_idx,blocks_rem);

-#endif

-        }

-

-      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;

-

-      if (skc_subgroup_lane() >= blocks_next)

-        {

-          bp_ids[bp_idx] = *blocks;

-        }

-    }

-

-  //

-  // flush work-in-progress ryx keys

-  //

-  if (sk_v_next > 0)

-    {

-      skc_uint sk_idx = 0;

-

-      if (skc_subgroup_lane() == 0)

-        {

-          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE

-            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);

-#if 0

-          printf("* %u\n",sk_idx);

-#endif

-        }

-

-      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

-

-      if (skc_subgroup_lane() < sk_v_next)

-        {

-          sk_extent[sk_idx] = *sk_v;

-        }

-    }

-}
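
The block-return path above reserves blocks_rem ring slots with a single atomic from lane 0, broadcasts the base index, and then each surplus lane stores its unused block id at (base + lane - blocks_next) & bp_mask. A small sketch with made-up numbers (base, bp_mask and blocks_next are assumptions, not values from the kernel):

#include <stdio.h>

int main(void)
{
  unsigned const base        = 100;   // result of the atomic add (assumed)
  unsigned const bp_mask     = 1023;  // pow2 ring mask (assumed)
  unsigned const blocks_next = 5;     // ids already consumed this pass (assumed)

  // lanes >= blocks_next return their unused block id to the ring
  for (unsigned lane = 0; lane < 8; lane++)
    if (lane >= blocks_next)
      printf("lane %u -> bp_ids[%u]\n", lane, (base + lane - blocks_next) & bp_mask);

  return 0;
}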

-

-//

-// If there are lanes that were unable to append to a bin because

-// their hashes collided with a bin's current ryx key then those bins

-// must be ejected.

-//

-// Note that we do not eject "full" bins because lazily waiting for a

-// collision results in simpler code.

-//

-

-static

-void

-skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,

-          __global union skc_bp_elem                 * const bp_elems,

-          __global uint                              * const bp_ids,

-          skc_uint                                     const bp_mask,

-          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,

-          skc_block_id_t                             * const subblocks,

-          skc_block_id_v_t                           * const blocks,

-          skc_uint                                   * const blocks_next,

-          skc_ttsk_v_t                               * const sk_v,

-          skc_uint                                   * const sk_v_next,

-          __global skc_ttsk_s_t                      * const sk_extent,

-          __local  struct skc_subgroup_smem volatile * const smem,

-          SKC_RASTERIZE_UINT                           const hash,

-          SKC_RASTERIZE_UINT                           const yx,

-          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value

-{

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-

-  //

-  // FIXME -- this code is now stale with the changes to the

-  // subblock/block allocation strategy

-  //

-

-  //

-  // get local TTSB ID queue count

-  //

-  skc_uint ttsb_id_count  = smem->pool.count; // scalar

-

-  // init hash bit mask

-  skc_uint component_mask = 0;

-

-  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)

-    {

-      // if no collision continue

-      if (((int*)&is_collision)[cc] == 0)

-        continue;

-

-      uint const winner        = ((uint*)&hash)[cc];

-      uint const component_bit = 1u << winner;

-

-      // if already processed this hash then continue

-      if (component_mask & component_bit)

-        continue;

-

-      // update component mask

-      component_mask |= component_bit;

-

-      //

-      // new winner requires ejecting the old TTSB

-      //

-      if (smem->bin.aN.count[winner] > 0)

-        {

-          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

-

-          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];

-        }

-

-        //

-        // ensure there is at least one TTSK and TTSB ID

-        //

-        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)

-          {

-            //

-            // update remaining count

-            //

-            ttsb_id_count = 0;

-

-            //

-            // flush accumulated ttsk_ryx keys

-            //

-            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE

-              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count

-

-#if 0

-            printf("# %u\n",idx);

-#endif

-

-            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)

-              {

-                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);

-              }

-

-            //

-            // allocate more ttsb ids from pool

-            //

-            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads

-

-            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)

-              smem->pool.aN.id[ii] = bp_ids[id + ii];

-          }

-

-      //

-      // invalidate the winning block

-      //

-

-      //

-      // update bin with winning yx, new ttsb id and zero count

-      //

-      // all lanes are loading/storing from/to the same index

-      //

-      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );

-      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];

-      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];

-      smem->bin.aN.count[winner] = 0;

-

-      //

-      // update count

-      //

-      ttsb_id_count += 1;

-    }

-

-  //

-  // save count

-  //

-  smem->pool.count = ttsb_id_count;

-

-#else

-  //

-  // SIMT

-  //

-

-  do {

-    //

-    // only one lane will win!

-    //

-    if (is_collision)

-      smem->subgroup.winner = hash;

-

-    barrier(CLK_LOCAL_MEM_FENCE);

-

-    //

-    // which bin is being ejected?

-    //

-    skc_uint const winner = smem->subgroup.winner;

-

-    //

-    // which colliding hash is taking over the bin?

-    //

-    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);

-

-    //

-    // all lanes with the same hash will try to store but only one

-    // lane will win

-    //

-    if (is_winner)

-      smem->subgroup.winner = yx;

-

-    barrier(CLK_LOCAL_MEM_FENCE);

-

-    //

-    // flush this block to the pool

-    //

-    if (smem->bin.aN.count[winner] > 0)

-      {

-        skc_block_id_v_t const id  = smem->bin.aN.id[winner];

-        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

-        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];

-#if 0

-        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);

-#endif

-        bp_elems[idx].u32 = tts;

-      }

-

-    //

-    // append new ttsk

-    //

-    skc_uint       const new_yx = smem->subgroup.winner;

-    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),

-                                                    blocks_next,

-                                                    bp_atomics,

-                                                    bp_mask, // pow2 modulo mask for block pool ring

-                                                    bp_ids,

-                                                    cohort_atomics,

-                                                    sk_v,

-                                                    sk_v_next,

-                                                    sk_extent,

-                                                    new_yx);

-

-#if 0

-    if (get_sub_group_local_id() == 0) {

-      printf(">>> %9u\n",new_id);

-    }

-#endif

-

-    //

-    // update bin with winning yx, new ttsb id and zero count

-    //

-    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;

-    smem->bin.aN.yx   [winner]                      = new_yx;

-    smem->bin.aN.id   [winner]                      = new_id;

-    smem->bin.aN.count[winner]                      = 0;

-

-    //

-    // remove all lanes matching this hash

-    //

-    is_collision = is_collision && !is_winner;

-

-    //

-    // exit if nothing left to do

-    //

-  } while (sub_group_any(is_collision));

-

-#endif

-}
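
The SIMT path above retires colliding lanes one hash at a time: whichever colliding lane's unsynchronized store lands last selects the bin to eject, every lane carrying that hash drops out, and the loop repeats until sub_group_any() reports no remaining collisions. A host-side simulation of just that control flow (the lane data is invented):

#include <stdbool.h>
#include <stdio.h>

#define LANES 8

int main(void)
{
  unsigned const hash[LANES]    = { 1, 3, 1, 2, 3, 3, 0, 1 };
  bool           collide[LANES] = { true, true, true, false, true, false, false, true };

  int ejected = 0;

  for (;;)
    {
      // exit if nothing left to do
      bool any = false;
      for (int i = 0; i < LANES; i++)
        any = any || collide[i];
      if (!any)
        break;

      // "last writer wins" stands in for the unsynchronized smem store
      unsigned winner = 0;
      for (int i = 0; i < LANES; i++)
        if (collide[i])
          winner = hash[i];

      // eject the winning bin, then retire every lane with that hash
      ejected++;
      for (int i = 0; i < LANES; i++)
        if (collide[i] && hash[i] == winner)
          collide[i] = false;
    }

  // two distinct hashes (1 and 3) collide here, so 2 bins get ejected
  printf("%d\n", ejected);
  return 0;
}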

-

-//

-// scatter scan max

-//

-static

-SKC_RASTERIZE_UINT

-skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,

-                     SKC_RASTERIZE_FLOAT                         const iss,

-                     SKC_RASTERIZE_FLOAT                         const ess)

-{

-  //

-  // prefix sums determine which lanes we're going to work on next

-  //

-  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);

-  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-#ifdef SKC_RASTERIZE_SIMD_USES_SMEM

-  //

-  // SIMD APPROACH 1: SIMT'ISH

-  //

-

-  // zero the volatile smem scratchpad using vector syntax

-  smem->subgroup.vN.scratch[0] = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                         \

-  if (is_scratch_store C)                               \

-    smem->subgroup.aN.scratch[scratch_idx C] = I;

-

-  SKC_RASTERIZE_VECTOR_EXPAND();

-

-  // propagate lanes to right using max scan

-  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];

-  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

-

-#else

-  //

-  // SIMD APPROACH 2: SCALAR'ISH

-  //

-

-  SKC_RASTERIZE_UINT source = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                 \

-  if (is_scratch_store C)                       \

-    ((uint *)&source)[scratch_idx C] = I;

-

-  SKC_RASTERIZE_VECTOR_EXPAND();

-

-  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)

-    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);

-#endif

-

-#else

-  //

-  // SIMT

-  //

-

-  //

-  // zero the volatile smem scratchpad using vector syntax

-  //

-  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );

-

-  //

-  // store source lane at starting lane

-  //

-  if (is_scratch_store)

-    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();

-

-  //

-  // propagate lanes to right using max scan

-  //

-  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];

-  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

-#endif

-

-  return source;

-}
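
A worked example of the scatter/max-scan pairing above, assuming a hypothetical 4-lane subgroup: with per-lane segment counts { 2, 1, 3, 1 }, the inclusive scan is { 2, 3, 6, 7 } and the exclusive scan is { 0, 2, 3, 6 }. Each lane scatters its own index into the scratchpad at its exclusive offset (lane 3's offset falls outside the subgroup, so it doesn't store this round), and the inclusive max-scan then smears those indices to the right so every lane knows which source lane's work it picks up. A host-side sketch:

#include <stdio.h>

#define LANES 4

int main(void)
{
  float const segs[LANES] = { 2.f, 1.f, 3.f, 1.f };

  // inclusive / exclusive prefix sums of the per-lane segment counts
  float iss[LANES], ess[LANES], sum = 0.f;
  for (int i = 0; i < LANES; i++)
    {
      ess[i] = sum;
      sum   += segs[i];
      iss[i] = sum;
    }

  // zero the scratchpad, then scatter each lane index to its exclusive offset
  unsigned scratch[LANES] = { 0 };
  for (int i = 0; i < LANES; i++)
    if (iss[i] > 0.f && ess[i] < (float)LANES)
      scratch[(int)ess[i]] = i;

  // inclusive max-scan propagates the source lane index to the right
  unsigned source[LANES];
  source[0] = scratch[0];
  for (int i = 1; i < LANES; i++)
    source[i] = scratch[i] > source[i - 1] ? scratch[i] : source[i - 1];

  // prints: 0 0 1 2  -- lanes 0..1 work on lane 0's segments, etc.
  for (int i = 0; i < LANES; i++)
    printf("%u ", source[i]);
  printf("\n");

  return 0;
}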

-

-//

-// sliver lines into subpixels

-//

-

-static

-void

-skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,

-           __global union skc_bp_elem                 * const bp_elems,

-           __global uint                              * const bp_ids,

-           skc_uint                                     const bp_mask,

-           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,

-           skc_block_id_t                             * const subblocks,

-           skc_block_id_v_t                           * const blocks,

-           skc_uint                                   * const blocks_next,

-           skc_ttsk_v_t                               * const sk_v,

-           skc_uint                                   * const sk_v_next,

-           __global skc_ttsk_s_t                      * const sk_extent,

-           __local  struct skc_subgroup_smem volatile * const smem,

-           SKC_RASTERIZE_FLOAT                          const l0x,

-           SKC_RASTERIZE_FLOAT                          const l0y,

-           SKC_RASTERIZE_FLOAT                          const l1x,

-           SKC_RASTERIZE_FLOAT                          const l1y)

-{

-  //

-  // Y-SLIVERING

-  // -----------

-  //

-  // immediately sliver all multi-pixel lines into 1-pixel high

-  // lines

-  //

-  // note this implicitly squelches horizontal lines

-  //

-  // there is another test for horizontal lines after x-slivering

-  // is complete

-  //

-

-  //

-  // will we need to flip the sign of y_delta ?

-  //

-  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);

-  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;

-

-  //

-  // save 1/dy

-  //

-  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);

-

-  //

-  // how many non-horizontal subpixel y-axis slivers are there?

-  //

-  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);

-  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);

-  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;

-  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;

-

-  //

-  // inclusive subgroup scan of y_segs

-  //

-  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);

-  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;

-  float                     y_rem   = skc_subgroup_last_float(y_iss);

-

-  //

-  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails

-  //

-  if (y_segs == 0.0f)

-    y_iss = 0.0f;

-

-#if 0

-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);

-#endif

-

-  //

-  // these values don't matter on first iteration

-  //

-  SKC_RASTERIZE_FLOAT n1x_prev = 0;

-  SKC_RASTERIZE_FLOAT n1y_prev = 0;

-

-  //

-  // loop until done

-  //

-  while (y_rem > 0.0f)

-    {

-      //

-      // distribute work across lanes

-      //

-      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);

-

-      //

-      // get the line at lane y_source

-      //

-      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);

-      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);

-      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);

-      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);

-

-      //

-      // every lane will create a 1 pixel tall line "sliver"

-      //

-      // FIXME -- this gets expanded on SIMD

-      //

-      // if numerator == 1 then this is the first lane

-      // if numerator == s then this is the last  lane

-      //

-      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);

-      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);

-

-      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);

-      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);

-

-      // toggle y_delta sign

-      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));

-

-      //

-      // calculate "right" line segment endpoint

-      //

-      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;

-      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);

-      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));

-

-      //

-      // override c1 if this is last point

-      //

-      n1y = select(n1y,m1y,is_y_last);

-      n1x = select(n1x,m1x,is_y_last);

-

-      //

-      // shuffle up "left" line segment endpoint

-      //

-      // NOTE: Intel's shuffle_up is unique with its elegant

-      // "previous" argument so don't get used to it

-      //

-      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);

-      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);

-

-      //

-      // override shuffle up if this is the first line segment

-      //

-      n0y = select(n0y,m0y,is_y_first);

-      n0x = select(n0x,m0x,is_y_first);

-

-      //

-      // save previous right endpoint

-      //

-      n1x_prev = n1x;

-      n1y_prev = n1y;

-

-      //

-      // decrement by subgroup size

-      //

-      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-

-#if 0

-      //

-      // debug

-      //

-      if (n0y != n1y) {

-        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);

-      }

-#endif

-

-      //

-      // X-SLIVERING

-      // -----------

-      //

-      // now sliver 1-pixel high lines into either vertical or

-      // 1-pixel wide lines

-      //

-      // save original direction and work with increasing x

-      //

-      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);

-      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;

-

-      //

-      // save 1/dx

-      //

-      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);

-

-      //

-      // how many subpixel x-axis slivers are there?

-      //

-      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);

-      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);

-      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;

-      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);

-

-      //

-      // inclusive subgroup scan of x_segs

-      //

-      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);

-      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;

-      float                     x_rem    = skc_subgroup_last_float(x_iss);

-

-      //

-      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails

-      //

-      //if (x_segs == 0.0f)

-      // x_iss = 0.0f;

-

-      //

-      // these values don't matter on first iteration

-      //

-      SKC_RASTERIZE_FLOAT       p1x_prev = 0;

-      SKC_RASTERIZE_FLOAT       p1y_prev = 0;

-

-      //

-      // loop until done

-      //

-      while (x_rem > 0)

-        {

-          //

-          // distribute work across lanes

-          //

-          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);

-

-          //

-          // get the line at lane x_source

-          //

-          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);

-          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);

-          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);

-          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);

-

-          //

-          // every lane will create a 1 pixel wide line "sliver"

-          //

-          // FIXME -- this gets expanded on SIMD

-          //

-          // if numerator == 1 then this is the first lane

-          // if numerator == s then this is the last  lane

-          //

-          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);

-          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);

-

-          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);

-          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);

-

-          // toggle x_delta sign

-          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));

-

-          //

-          // calculate "right" line segment endpoint

-          //

-          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;

-          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);

-          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));

-

-          //

-          // override c1 if this is last point

-          //

-          p1x = select(p1x,o1x,is_x_last);

-          p1y = select(p1y,o1y,is_x_last);

-

-          //

-          // shuffle up "left" line segment endpoint

-          //

-          // NOTE: Intel's shuffle_up is unique with its elegant

-          // "previous" argument so don't get used to it

-          //

-          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);

-          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);

-

-          //

-          // override shuffle up if this is the first line segment

-          //

-          p0x = select(p0x,o0x,is_x_first);

-          p0y = select(p0y,o0y,is_x_first);

-

-          //

-          // save previous right endpoint

-          //

-          p1x_prev = p1x;

-          p1y_prev = p1y;

-

-          //

-          // decrement by subgroup size

-          //

-          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-

-          //

-          // only non-horizontal subpixel lines are valid

-          //

-          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);

-

-          //

-          // if no lanes are active then continue

-          //

-          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY

-          // IMPACTS PERFORMANCE (+12% ?)

-          //

-          // IT SHOULDN'T !!!

-          //

-#if 0

-          if (!skc_subgroup_any(is_active))

-            continue;

-#endif

-

-          //

-          // Option 1: use SLM for explicitly managed coalesced stores

-          //

-          // 1. which tile does this line belong?

-          // 2. hash tile coordinates

-          // 3. lookup hash

-          // 4. if tile matches then SLM append keys

-          // 5. if tile doesn't match

-          //   a. flush

-          //   b. create new TTSK_RYX

-          //   c. obtain TTSB block from pool

-          //   d. goto 3.

-          //

-

-          //

-          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores

-          //

-          // 1. which tile does this line belong?

-          // 2. hash tile coordinates

-          // 3. lookup hash

-          // 4. if tile matches then GMEM append keys

-          // 5. if tile doesn't match

-          //   a. flush (and invalidate empty elems)

-          //   b. create new TTSK_RYX

-          //   c. obtain TTSB block from pool

-          //   d. goto 3.

-          //

-

-          //

-          // The virtual rasterization surface is very large and

-          // signed: +/- ~64K-256K, depending on the architecture.

-          //

-          // Rasters must be clipped to the virtual surface and,

-          // optionally, clipped even further on a per raster

-          // basis.

-          //

-

-          //

-          // Clip to the per-raster clip

-          //

-

-          /*

-

-            CLIP HERE

-

-          */

-

-          //

-          // Hash the tile coordinates

-          //

-          // This table lists nominal values for each architecture.

-          // We want to choose values that naturally fit the

-          // "width" of the architecture.

-          //

-          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS

-          //   ----  -------  ----  ---------  --------  ---------

-          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?

-          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*

-          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?

-          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*

-          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon

-          //

-          // NOTE: When possible, bias the hash toward using more y

-          // bits because of:

-          //

-          //   1. the 90 degree counter-clockwise rotation that we put

-          //      in place to offset the render-time clockwise

-          //      rotation

-          //

-          //   2. the likely presence of left-to-right or

-          //      right-to-left glyphs.

-          //

-          // For power-of-two bins, the hash is easy.

-          //

-          // For non-power-of-two, we may want to either implement a

-          // fast mod (compiler should do this for us... hahahaha) or

-          // drop down to the next power-of-two.

-          //

-

-          //

-          // FIXME -- this snarl is not good -- can probably reduce

-          // some of the sign casting but some is there to vectorize a

-          // scalar

-          //

-          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);

-          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);

-

-          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);

-          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);

-

-          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);

-          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);

-

-          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;

-

-          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;

-          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);

-

-          //

-          // map [+1,+32] to [ 0,+31]

-          // map [-1,-32] to [-1,-32]

-          //

-          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;

-

-          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);

-          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);

-          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;

-

-          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;

-          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);

-

-          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;

-

-          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |

-                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));

-

-          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));

-

-#if 0

-          printf("(%3u, %3u)\n",tile_y,tile_x);

-#endif

-

-#if 0

-          if (is_active)

-            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);

-#endif

-

-          //

-          // debug

-          //

-#if 0 // PRINTF_ENABLE

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                         \

-          if (is_active C)                                              \

-            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);

-

-          SKC_RASTERIZE_VECTOR_EXPAND();

-#else

-          if (is_active)

-            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);

-#endif

-

-#endif

-          //

-          // flush all active lanes

-          //

-          while (true)

-            {

-              //

-              // either gather load or vector load+shuffle the yx keys

-              //

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;

-              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);

-#else

-              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];

-#endif

-

-              //

-              // does yx for lane match yx for hash?

-              //

-              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;

-              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);

-

-              //

-              // OpenCL spec: "When casting a bool to a vector integer

-              // data type, the vector components will be set to -1

-              // (i.e. all bits set) if the vector bool value is true

-              // and 0 otherwise."

-              //

-#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )

-              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;

-#else

-              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}

-#endif

-              //

-              // how many new elements for each matching hash bin?

-              //

-              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;

-              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;

-

-              //

-              // prefix sum all of the bins in parallel

-              //

-              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);

-              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);

-

-              //

-              // current bin counts

-              //

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;

-              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);

-#else

-              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];

-#endif

-

-              //

-              // calculate where each cache-hit and in-bounds tts should be stored

-              //

-              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;

-              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;

-

-              //

-              // which lanes can append to a matching bin?

-              //

-              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);

-

-              //

-              // scatter append tts elements to bin blocks

-              //

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)

-              //

-              // SIMD

-              //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                         \

-              if (is_append C)                                          \

-                {                                                       \

-                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \

-                  smem->bin.aN.count[hash C]               = count_new C; \

-                }

-

-              SKC_RASTERIZE_VECTOR_EXPAND();

-#else

-              //

-              // SIMT

-              //

-              if (is_append)

-                {

-                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;

-                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS

-                }

-#endif

-              //

-              // try to keep predicate updates SIMD-friendly and

-              // outside of predicated code paths -- this is not

-              // always how we would normally do things on SIMT but

-              // either approach is acceptable

-              //

-

-              //

-              // mask off lanes/components that successfully appended

-              //

-              is_active = is_active && !is_append;

-

-              //

-              // are there any active lanes left?

-              //

-              if (!skc_subgroup_any(is_active))

-                break;

-

-              //

-              // If there are active lanes that couldn't be appended to a

-              // bin because their hashes collided with the bin's current

-              // ryx key, then those bins must be ejected.

-              //

-              // Note that we do not eject "full" bins because lazily

-              // waiting for a collision results in simpler code.

-              //

-              skc_flush(bp_atomics,

-                        bp_elems,

-                        bp_ids,

-                        bp_mask,

-                        cohort_atomics,

-                        subblocks,

-                        blocks,

-                        blocks_next,

-                        sk_v,

-                        sk_v_next,

-                        sk_extent,

-                        smem,

-                        hash,

-                        yx,

-                        is_active);

-            }

-        }

-    }

-}
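
A scalar sketch of the TTS packing assembled near the end of the loop above. The bit positions (dy at bit 26, ty at 16, sx at 10, tx in the low bits) come from the expression in the code; the sample coordinate values are invented, since the subtile masks aren't defined in this file:

#include <stdio.h>

int main(void)
{
  int      const dy = 3;   // subpixel y delta, sign preserved
  unsigned const ty = 5;   // y offset within the subtile (assumed)
  unsigned const sx = 7;   // x extent of the sliver (assumed)
  unsigned const tx = 9;   // x offset within the subtile (assumed)

  // map [+1,+32] -> [0,+31], leave negative deltas alone, pack into the top bits
  // (relies on arithmetic right shift of a negative int, as OpenCL guarantees)
  int const dys = (dy + (~dy >> 31)) << 26;

  unsigned const tts = (unsigned)dys | (ty << 16) | (sx << 10) | tx;

  // dy field becomes 2, so this prints 0x08051C09
  printf("0x%08X\n", tts);
  return 0;
}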

-

-//

-// INITIALIZE SMEM

-//

-// Note that SIMD/SIMT have nearly the same syntax.

-//

-static

-void

-skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)

-{

-  //

-  // initialize smem bins

-  //

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );

-  smem->bin.vN.count = ( 0 );

-#else

-  //

-  // SIMT

-  //

-  int idx = skc_subgroup_lane();

-

-#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )

-  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)

-#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )

-  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)

-#endif

-    {

-      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );

-      smem->bin.aN.count[idx] = ( 0 );

-    }

-#endif

-}

-

-//

-// RASTERIZE CUBIC KERNEL

-//

-

-static

-void

-skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                     __global union skc_bp_elem                * const bp_elems,

-                     __global uint                             * const bp_ids,

-                     skc_uint                                    const bp_mask,

-

-                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                     __global skc_ttsk_s_t                     * const sk_extent,

-

-                     __local struct skc_subgroup_smem volatile * const smem,

-

-                     skc_uint                                  * const nodeword,

-                     skc_block_id_t                            * const id,

-

-                     union skc_transform              const    * const tv,

-                     union skc_path_clip              const    * const cv,

-                     skc_uint                                    const cohort)

-{

-  //

-  // the initial segment idx and segments-per-block constant determine

-  // how many block ids will need to be loaded

-  //

-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  //

-  // apply transform

-  //

-  // note that only the end points need to be rounded to subpixel precision

-  //

-  // FIXME -- transformation is currently affine-only -- support perspective later

-  //

-  // the affine transformation requires 8 FMA + 2 ROUND operations

-  //

-  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);

-

-  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;

-  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

-

-  SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;

-  SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

-

-  SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx  + c3y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy  + tv->ty);

-

-  //

-  //

-  //

-#if PRINTF_ENABLE

-

-#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                         \

-  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \

-         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \

-         b0x C,b0y C,t1x C,t1y C,                                       \

-         t2x C,t2y C,t3x C,t3y C);

-

-  SKC_RASTERIZE_VECTOR_EXPAND();

-

-#else

-

-  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",

-         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

-

-#endif

-

-#endif

-

-  //

-  // OLD APPROACH

-  // ------------

-  //

-  // The Spinel CUDA rasterizer was significantly more complex and

-  // performed a few different tasks that are probably best kept

-  // separate.

-  //

-  // The Spinel rasterizer held a Bezier's 4-element x and y coordinates

-  // in adjacent lanes. This simplified intermingling of single lane

-  // 4-coordinate line segments with two-lane cubic Beziers.

-  //

-  // After transformation of the input segments, the Spinel rasterizer

-  // would test cubics for flatness and, if flat, collapse the

-  // adjacent lanes into a single line lane and an empty lane.

-  //

-  // Any lines would then be appended to a line queue.

-  //

-  // Any cubics would then be subdivided.

-  //

-  // The reclassification process would be repeated.

-  //

-  // NEW APPROACH

-  // ------------

-  //

-  // Assume we're only working with cubics in this kernel.

-  //

-  // Optimization: if the line segment is a special case -- a cusp,

-  // has 1+ inflections, or a loop -- it might be beneficial to

-  // subdivide the control cage 1+ times in order to separate the

-  // flatter segments from the high-velocity region(s).

-  //

-  // This means we want to split using [a,b] formulation to _directly_

-  // subdivide producing a new control cage.

-  //

-  // Wang's Formula is still useful even if we subdivide once or twice

-  // as it's so cheap that it might give some useful hints about where

-  // the high-velocity sections of curve reside.

-  //

-  // But it seems like using Wang's and directly flattening to line

-  // segments without any subdivision is good enough for the limited

-  // set of test cases that I've tried.

-  //

-  // So... use Wang's Formula to estimate how many line segments are

-  // required to properly flatten the cubics.

-  //

-  // Then use inclusive/exclusive scans to put all the lanes to work:

-  //

-  //   1. segmenting cubics to line segments

-  //

-  //   2. slivering line segments into 1-pixel high line segments

-  //

-  //   3. slivering 1-pixel high line segments into 1-pixel wide line

-  //      segments

-  //

-  // MORE BACKGROUND ON NEW APPROACH

-  // -------------------------------

-  //

-  // Two options for handling line segments:

-  //

-  // 1. append the line segments onto an SLM array until enough

-  //    work has been accrued (Spinel does this)

-  //

-  // 2. immediately sliver the potentially multi-pixel line

-  //    segments into subpixel lines

-  //

-  // The advantage of (1) is that it guarantees the slivering

-  // process will, on average, always be emitting a full subgroup

-  // of subpixel lines.

-  //

-  // The advantage of (2) is that it reduces code complexity and

-  // leaves more room for SLM tile bins. The difference between Spinel

-  // and Skia Compute is that Wang's Formula guarantees there will be

-  // a full subgroup of multi-pixel lines unless this is the final

-  // iteration of the warp of multi-pixel lines.

-  //

-  // Note that wider GPU architectures might benefit from (1) and

-  // other work accumulation strategies because it will minimize

-  // partial warp workloads in the final iteration of each stage.  It

-  // also minimizes the sunk cost of the uniform control logic steps.

-  //

-  // So let's implement (2) for now...

-  //

-

-  //

-  // And... begin!

-  //

-  // Estimate how many line segments are in the quad/cubic curve.

-  //

-  // Wang's Formula will return zero if the control points are

-  // collinear but we bump it up to 1.0f.

-  //

-  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

-

-  //

-  // if there are free registers then precalculate the reciprocal for

-  // each estimated segments since it will never change

-  //

-  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

-

-

-  //

-  // inclusive add scan of estimated line segments

-  // exclusive add scan of estimated line segments

-  // total number       of estimated line segments

-  //

-  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);

-  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;

-  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar

-

-  //

-  // Precompute cubic polynomial coefficients from transformed control

-  // cage so we can shuffle them in on each iteration of the outer

-  // loop and then evaluate the polynomial in Horner form.

-  //

-  //                            |  1  0  0  0 | | c0 |

-  //                            |             | |    |

-  //                            | -3  3  0  0 | | c1 |

-  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |

-  //                            |  3 -6  3  0 | | c2 |

-  //                            |             | |    |

-  //                            | -1  3 -3  1 | | c3 |

-  //

-  //

-  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL

-  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL

-

-  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL

-  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL

-

-  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB

-  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB

-

-  //

-  // these values don't matter on the first iteration

-  //

-  SKC_RASTERIZE_FLOAT l1x_prev  = 0;

-  SKC_RASTERIZE_FLOAT l1y_prev  = 0;

-

-  //

-  // allocate and init in-register TTSK keys

-  //

-  skc_uint     sk_v_next = 0;

-  skc_ttsk_v_t sk_v; 

-

-  sk_v.hi = cohort;

-

-  //

-  // initialize smem

-  //

-  skc_smem_init(smem);

-

-  //

-  // initialize blocks / subblocks

-  //

-  skc_block_id_v_t blocks;

-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

-

-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

-  skc_block_id_t   subblocks   = 0;

-#endif

-

-  //

-  // loop until done

-  //

-  while (s_rem > 0)

-    {

-      //

-      // distribute work across lanes

-      //

-      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

-

-      //

-      // every lane has a fraction to work off of

-      //

-      // FIXME -- this gets expanded on SIMD

-      //

-      // if delta == 1      then this is the first lane

-      // if count == s_segs then this is the last  lane

-      //

-      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);

-      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

-

-      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);

-      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

-

-      //

-      // init parametric t

-      //

-      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

-

-      //

-      // if last then override to a hard 1.0f

-      //

-      s_t    = is_s_last ? 1.0f : s_t;

-

-      //

-      // decrement by subgroup size

-      //

-      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-

-      //

-      // now every lane knows what to do and the following lines will

-      // pump out up to SUBGROUP_SIZE line segments

-      //

-      // obtain the src vertices through shared memory or via a shuffle

-      //

-

-      //

-      // shuffle in the polynomial coefficients from their source lane

-      //

-      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);

-      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

-

-      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);

-      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

-

-      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);

-      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

-

-      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);

-      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);

-

-      //

-      // calculate "right" line segment endpoint using Horner form

-      //

-      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND

-      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND

-

-      //

-      // shuffle up "left" line segment endpoint

-      //

-      // NOTE: Intel's shuffle_up is unique with its elegant

-      // "previous" argument so don't get used to it

-      //

-      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);

-      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

-

-      //

-      // save previous right endpoint

-      //

-      l1x_prev = l1x;

-      l1y_prev = l1y;

-

-      //

-      // override shuffle up if this is the first line segment

-      //

-      l0x = select(l0x,s0x,is_s_first);

-      l0y = select(l0y,s0y,is_s_first);

-

-      //

-      // sliver lines

-      //

-      skc_sliver(bp_atomics,

-                 bp_elems,

-                 bp_ids,

-                 bp_mask,

-                 cohort_atomics,

-                 &subblocks,

-                 &blocks,

-                 &blocks_next,

-                 &sk_v,

-                 &sk_v_next,

-                 sk_extent,

-                 smem,

-                 l0x,l0y,l1x,l1y);

-    }

-

-  //

-  // - flush work-in-progress blocks

-  // - return unused block ids

-  //

-  skc_finalize(bp_atomics,

-               bp_elems,

-               bp_ids,

-               bp_mask,

-               cohort_atomics,

-               &blocks,

-               blocks_next,

-               &sk_v,

-               sk_v_next,

-               sk_extent,

-               smem);

-}

-

-//

-// RASTERIZE QUAD KERNEL

-//

-

-static

-void

-skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                    __global union skc_bp_elem                * const bp_elems,

-                    __global uint                             * const bp_ids,

-                    skc_uint                                    const bp_mask,

-

-                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                    __global skc_ttsk_s_t                     * const sk_extent,

-

-                    __local struct skc_subgroup_smem volatile * const smem,

-                    

-                    skc_uint                                  * const nodeword,

-                    skc_block_id_t                            * const id,

-

-                    union skc_transform              const    * const tv,

-                    union skc_path_clip              const    * const cv,

-                    skc_uint                                    const cohort)

-{

-  //

-  // the initial segment idx and segments-per-block constant determine

-  // how many block ids will need to be loaded

-  //

-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  //

-  // apply transform

-  //

-  // note that we only care that the end points are rounded to subpixel precision

-  //

-  // FIXME -- transformation is currently affine-only; support perspective later

-  //

-  // the affine transformation requires 8 FMA + 2 ROUND operations

-  //

-  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);

-

-  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;

-  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

-

-  SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx  + c2y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy  + tv->ty);

-

-  //

-  // Estimate how many line segments are in the quad/cubic curve.

-  //

-  // Wang's Formula will return zero if the control points are

-  // collinear but we bump it up to 1.0f.

-  //

-  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);

-

-  //

-  // if there are free registers then precalculate the reciprocal for

-  // each estimated segment count since it will never change

-  //

-  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

-

-

-  //

-  // inclusive add scan of estimated line segments

-  // exclusive add scan of estimated line segments

-  // total number       of estimated line segments

-  //

-  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);

-  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;

-  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar

-

-  //

-  // Precompute quadratic polynomial coefficients from control cage so

-  // we can shuffle them in on each iteration of the outer loop and

-  // then evaluate the polynomial in Horner form.

-  //

-

-  //                        |  1  0  0  | | c0 |

-  //                        |           | |    |

-  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |

-  //                        |           | |    |

-  //                        |  1 -2  1  | | c2 |

-  //

-  //

-  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL

-  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL

-

-  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD

-  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD

-

-  //

-  // these values don't matter on the first iteration

-  //

-  SKC_RASTERIZE_FLOAT l1x_prev  = 0;

-  SKC_RASTERIZE_FLOAT l1y_prev  = 0;

-

-  //

-  // allocate and init in-register TTSK keys

-  //

-  skc_uint     sk_v_next = 0;

-  skc_ttsk_v_t sk_v; 

-

-  sk_v.hi = cohort;

-

-  //

-  // initialize smem

-  //

-  skc_smem_init(smem);

-

-  //

-  // initialize blocks / subblocks

-  //

-  skc_block_id_v_t blocks;

-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

-

-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

-  skc_block_id_t   subblocks   = 0;

-#endif

-

-  //

-  // loop until done

-  //

-  while (s_rem > 0)

-    {

-      //

-      // distribute work across lanes

-      //

-      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

-

-      //

-      // every lane has a fraction to work off of

-      //

-      // FIXME -- this gets expanded on SIMD

-      //

-      // if delta == 1      then this is the first lane

-      // if count == s_segs then this is the last  lane

-      //

-      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);

-      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

-

-      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);

-      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

-

-      //

-      // init parametric t

-      //

-      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

-

-      //

-      // if last then override to a hard 1.0f

-      //

-      s_t    = is_s_last ? 1.0f : s_t;

-

-      //

-      // decrement by subgroup size

-      //

-      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

-

-      //

-      // now every lane knows what to do and the following lines will

-      // pump out up to SUBGROUP_SIZE line segments

-      //

-      // obtain the src vertices through shared memory or via a shuffle

-      //

-

-      //

-      // shuffle in the polynomial coefficients from their source lane

-      //

-      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);

-      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

-

-      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);

-      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

-

-      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);

-      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

-

-      //

-      // calculate "right" line segment endpoint using Horner form

-      //

-      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND

-      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND

-

-      //

-      // shuffle up "left" line segment endpoint

-      //

-      // NOTE: Intel's shuffle_up is unique with its elegant

-      // "previous" argument so don't get used to it

-      //

-      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);

-      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

-

-      //

-      // save previous right endpoint

-      //

-      l1x_prev = l1x;

-      l1y_prev = l1y;

-

-      //

-      // override shuffle up if this is the first line segment

-      //

-      l0x = select(l0x,s0x,is_s_first);

-      l0y = select(l0y,s0y,is_s_first);

-

-      //

-      // sliver lines

-      //

-      skc_sliver(bp_atomics,

-                 bp_elems,

-                 bp_ids,

-                 bp_mask,

-                 cohort_atomics,

-                 &subblocks,

-                 &blocks,

-                 &blocks_next,

-                 &sk_v,

-                 &sk_v_next,

-                 sk_extent,

-                 smem,

-                 l0x,l0y,l1x,l1y);

-    }

-

-  //

-  // - flush work-in-progress blocks

-  // - return unused block ids

-  //

-  skc_finalize(bp_atomics,

-               bp_elems,

-               bp_ids,

-               bp_mask,

-               cohort_atomics,

-               &blocks,

-               blocks_next,

-               &sk_v,

-               sk_v_next,

-               sk_extent,

-               smem);

-}

-

-//

-// RASTERIZE LINE KERNEL

-//

-

-static

-void

-skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                    __global union skc_bp_elem                * const bp_elems,

-                    __global uint                             * const bp_ids,

-                    skc_uint                                    const bp_mask,

-

-                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                    __global skc_ttsk_s_t                     * const sk_extent,

-

-                    __local struct skc_subgroup_smem volatile * const smem,

-                    

-                    skc_uint                                  * const nodeword,

-                    skc_block_id_t                            * const id,

-

-                    union skc_transform              const    * const tv,

-                    union skc_path_clip              const    * const cv,

-                    skc_uint                                    const cohort)

-{

-  //

-  // the initial segment idx and segments-per-block constant determine

-  // how many block ids will need to be loaded

-  //

-  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-  skc_segment_next(bp_elems,nodeword,id);

-

-  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

-

-#if 0

-  // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);

-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);

-#endif

-

-  //

-  // apply transform

-  //

-  // note that we only care that the end points are rounded to subpixel precision

-  //

-  // FIXME -- transformation is currently affine-only

-  // FIXME -- support perspective later

-  //

-  // the affine transformation requires 8 FMA + 4 ROUND operations

-  //

-  SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);

-

-  SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx  + c1y * tv->shx + tv->tx);

-  SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy  + tv->ty);

-

-#if 0

-  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);

-#endif

-

-  //

-  // allocate and init in-register TTSK keys

-  //

-  skc_uint     sk_v_next = 0;

-  skc_ttsk_v_t sk_v; 

-

-  sk_v.hi = cohort;

-

-  //

-  // initialize smem

-  //

-  skc_smem_init(smem);

-

-  //

-  // initialize blocks / subblocks

-  //

-  skc_block_id_v_t blocks;

-  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

-

-#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

-  skc_block_id_t   subblocks   = 0;

-#endif

-

-  //

-  // sliver lines

-  //

-  skc_sliver(bp_atomics,

-             bp_elems,

-             bp_ids,

-             bp_mask,

-             cohort_atomics,

-             &subblocks,

-             &blocks,

-             &blocks_next,

-             &sk_v,

-             &sk_v_next,

-             sk_extent,

-             smem,

-             l0x,l0y,l1x,l1y);

-

-  //

-  // - flush work-in-progress blocks

-  // - return unused block ids

-  //

-  skc_finalize(bp_atomics,

-               bp_elems,

-               bp_ids,

-               bp_mask,

-               cohort_atomics,

-               &blocks,

-               blocks_next,

-               &sk_v,

-               sk_v_next,

-               sk_extent,

-               smem);

-}

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                         __global union skc_bp_elem                * const bp_elems,

-                         __global uint                             * const bp_ids,

-                         skc_uint                                    const bp_mask,

-

-                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                         __global skc_ttsk_s_t                     * const sk_extent,

-

-                         __global float8                  const    * const transforms, // FIXME -- __constant

-                         __global float4                  const    * const clips,      // FIXME -- __constant

-                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                         skc_uint                                    const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  __local struct skc_subgroup_smem volatile                smem[1];

-#else

-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];

-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();

-#endif

-  

-  //

-  // this is a subgroup/warp-centric kernel

-  //

-  // which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing

-  // get_group_id(0) as a uniform but the alternative calculation used

-  // when there are multiple subgroups per workgroup is not

-  // cooperating and is driving spillage elsewhere.

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  uint const cmd_idx = get_group_id(0);

-#else

-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("+cmd_idx = %u\n",cmd_idx);

-#endif

-

-  //

-  // if workgroups are multi-subgroup then there may be excess

-  // subgroups in the final workgroup

-  //

-  if (cmd_idx >= count)

-    return;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("-cmd_idx = %u\n",cmd_idx);

-#endif

-

-  //

-  // load a single command for this subgroup

-  //

-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("[ %u ]< %u, %u, %u, %u >\n",

-           cmd_idx,

-           cmd.nodeword,

-           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),

-           SKC_CMD_RASTERIZE_GET_CLIP(cmd),

-           SKC_CMD_RASTERIZE_GET_COHORT(cmd));

-#endif

-

-  //

-  // get first block node command word and its subblock

-  //

-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing

-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;

-  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);

-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

-

-  //

-  // load transform -- uniform across subgroup

-  //

-  // v8: { sx shx tx shy sy ty w0 w1 }

-  //

-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:

-  //

-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]

-  //

-  // Coordinates are scaled to subpixel resolution.  All that matters

-  // is that continuity is maintained between path element

-  // endpoints.

-  //

-  // It's the responsibility of the host to ensure that the transforms

-  // are properly scaled either via intitializing a transform stack

-  // with the subpixel resolution scaled identity or scaling the

-  // transform before its loaded by a rasterization grid.

-  //

-  // FIXME -- horizontal load might be better than this broadcast load

-  //

-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load

-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

-

-  switch (tag)

-    {

-    case SKC_BLOCK_ID_TAG_PATH_LINE:

-      skc_rasterize_lines(bp_atomics,

-                          bp_elems,

-                          bp_ids,

-                          bp_mask,

-                          cohort_atomics,

-                          sk_extent,

-                          smem,

-                          &nodeword,&id,

-                          &tv,&cv,cohort);

-      break;

-

-    case SKC_BLOCK_ID_TAG_PATH_QUAD:

-      skc_rasterize_quads(bp_atomics,

-                          bp_elems,

-                          bp_ids,

-                          bp_mask,

-                          cohort_atomics,

-                          sk_extent,

-                          smem,

-                          &nodeword,&id,

-                          &tv,&cv,cohort);

-      break;

-

-    case SKC_BLOCK_ID_TAG_PATH_CUBIC:

-      skc_rasterize_cubics(bp_atomics,

-                           bp_elems,

-                           bp_ids,

-                           bp_mask,

-                           cohort_atomics,

-                           sk_extent,

-                           smem,

-                           &nodeword,&id,

-                           &tv,&cv,cohort);

-      break;

-

-    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:

-      break;

-    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:

-      break;

-

-    default:

-      break;

-    }

-}

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                           __global union skc_bp_elem                * const bp_elems,

-                           __global uint                             * const bp_ids,

-                           skc_uint                                    const bp_mask,

-

-                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                           __global skc_ttsk_s_t                     * const sk_extent,

-

-                           __global float8                  const    * const transforms, // FIXME -- __constant

-                           __global float4                  const    * const clips,      // FIXME -- __constant

-                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                           skc_uint                                    const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  __local struct skc_subgroup_smem volatile                smem[1];

-#else

-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];

-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();

-#endif

-  

-  //

-  // this is a subgroup/warp-centric kernel

-  //

-  // which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing

-  // get_group_id(0) as a uniform but the alternative calculation used

-  // when there are multiple subgroups per workgroup is not

-  // cooperating and is driving spillage elsewhere.

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  uint const cmd_idx = get_group_id(0);

-#else

-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  //

-  // if workgroups are multi-subgroup then there may be excess

-  // subgroups in the final workgroup

-  //

-  if (cmd_idx >= count)

-    return;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("cmd_idx = %u\n",cmd_idx);

-#endif

-

-  //

-  // load a single command for this subgroup

-  //

-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

-

-  //

-  // get first block node command word and its subblock

-  //

-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing

-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;

-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

-

-  //

-  // load transform -- uniform across subgroup

-  //

-  // v8: { sx shx tx shy sy ty w0 w1 }

-  //

-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:

-  //

-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]

-  //

-  // Coordinates are scaled to subpixel resolution.  All that matters

-  // is that continuity is maintained between path element

-  // endpoints.

-  //

-  // It's the responsibility of the host to ensure that the transforms

-  // are properly scaled either via initializing a transform stack

-  // with the subpixel resolution scaled identity or scaling the

-  // transform before it's loaded by a rasterization grid.

-  //

-  // FIXME -- horizontal load might be better than this broadcast load

-  //

-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load

-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

-

-  skc_rasterize_lines(bp_atomics,

-                      bp_elems,

-                      bp_ids,

-                      bp_mask,

-                      cohort_atomics,

-                      sk_extent,

-                      smem,

-                      &nodeword,&id,

-                      &tv,&cv,cohort);

-}

-

-//

-//

-//

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                           __global union skc_bp_elem                * const bp_elems,

-                           __global uint                             * const bp_ids,

-                           skc_uint                                    const bp_mask,

-

-                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                           __global skc_ttsk_s_t                     * const sk_extent,

-

-                           __global float8                  const    * const transforms, // FIXME -- __constant

-                           __global float4                  const    * const clips,      // FIXME -- __constant

-                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                           skc_uint                                    const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  __local struct skc_subgroup_smem volatile                smem[1];

-#else

-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];

-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();

-#endif

-  

-  //

-  // this is a subgroup/warp-centric kernel

-  //

-  // which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing

-  // get_group_id(0) as a uniform but the alternative calculation used

-  // when there are multiple subgroups per workgroup is not

-  // cooperating and is driving spillage elsewhere.

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  uint const cmd_idx = get_group_id(0);

-#else

-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  //

-  // if workgroups are multi-subgroup then there may be excess

-  // subgroups in the final workgroup

-  //

-  if (cmd_idx >= count)

-    return;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("cmd_idx = %u\n",cmd_idx);

-#endif

-

-  //

-  // load a single command for this subgroup

-  //

-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

-

-  //

-  // get first block node command word and its subblock

-  //

-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing

-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;

-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

-

-  //

-  // load transform -- uniform across subgroup

-  //

-  // v8: { sx shx tx shy sy ty w0 w1 }

-  //

-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:

-  //

-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]

-  //

-  // Coordinates are scaled to subpixel resolution.  All that matters

-  // is that continuity is maintained between path element

-  // endpoints.

-  //

-  // It's the responsibility of the host to ensure that the transforms

-  // are properly scaled either via initializing a transform stack

-  // with the subpixel resolution scaled identity or scaling the

-  // transform before it's loaded by a rasterization grid.

-  //

-  // FIXME -- horizontal load might be better than this broadcast load

-  //

-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load

-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

-

-  skc_rasterize_quads(bp_atomics,

-                      bp_elems,

-                      bp_ids,

-                      bp_mask,

-                      cohort_atomics,

-                      sk_extent,

-                      smem,

-                      &nodeword,&id,

-                      &tv,&cv,cohort);

-}

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                            __global union skc_bp_elem                * const bp_elems,

-                            __global uint                             * const bp_ids,

-                            skc_uint                                    const bp_mask,

-

-                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                            __global skc_ttsk_s_t                     * const sk_extent,

-

-                            __global float8                  const    * const transforms, // FIXME -- __constant

-                            __global float4                  const    * const clips,      // FIXME -- __constant

-                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                            skc_uint                                    const count)

-{

-  //

-  // declare shared memory block

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  __local struct skc_subgroup_smem volatile                smem[1];

-#else

-  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];

-  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();

-#endif

-  

-  //

-  // this is a subgroup/warp-centric kernel

-  //

-  // which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler appears to be recognizing

-  // get_group_id(0) as a uniform but the alternative calculation used

-  // when there are multiple subgroups per workgroup is not

-  // cooperating and is driving spillage elsewhere.

-  //

-#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )

-  uint const cmd_idx = get_group_id(0);

-#else

-  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  //

-  // if workgroups are multi-subgroup then there may be excess

-  // subgroups in the final workgroup

-  //

-  if (cmd_idx >= count)

-    return;

-

-#if 0

-  if (get_sub_group_local_id() == 0)

-    printf("cmd_idx = %u\n",cmd_idx);

-#endif

-

-  //

-  // load a single command for this subgroup

-  //

-  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

-

-  //

-  // get first block node command word and its subblock

-  //

-  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing

-  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;

-  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

-

-  //

-  // load transform -- uniform across subgroup

-  //

-  // v8: { sx shx tx shy sy ty w0 w1 }

-  //

-  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:

-  //

-  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]

-  //

-  // Coordinates are scaled to subpixel resolution.  All that matters

-  // is that continuity is maintained between path element

-  // endpoints.

-  //

-  // It's the responsibility of the host to ensure that the transforms

-  // are properly scaled either via initializing a transform stack

-  // with the subpixel resolution scaled identity or scaling the

-  // transform before it's loaded by a rasterization grid.

-  //

-  // FIXME -- horizontal load might be better than this broadcast load

-  //

-  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load

-  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load

-  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

-

-  skc_rasterize_cubics(bp_atomics,

-                       bp_elems,

-                       bp_ids,

-                       bp_mask,

-                       cohort_atomics,

-                       sk_extent,

-                       smem,

-                       &nodeword,&id,

-                       &tv,&cv,cohort);

-}

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                               __global union skc_bp_elem                * const bp_elems,

-                               __global uint                             * const bp_ids,

-                               skc_uint                                    const bp_mask,

-

-                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                               __global skc_ttsk_s_t                     * const sk_extent,

-

-                               __global float8                  const    * const transforms, // FIXME -- __constant

-                               __global float4                  const    * const clips,      // FIXME -- __constant

-                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                               skc_uint                                    const count)

-{

-  ;

-}

-

-//

-//

-//

-

-__kernel

-SKC_RASTERIZE_KERNEL_ATTRIBS

-void

-skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,

-                                __global union skc_bp_elem                * const bp_elems,

-                                __global uint                             * const bp_ids,

-                                skc_uint                                    const bp_mask,

-

-                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,

-                                __global skc_ttsk_s_t                     * const sk_extent,

-

-                                __global float8                  const    * const transforms, // FIXME -- __constant

-                                __global float4                  const    * const clips,      // FIXME -- __constant

-                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant

-                                skc_uint                                    const count)

-{

-  ;

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "raster_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+// #define SKC_ARCH_AVX2
+// #define SKC_RASTERIZE_SIMD_USES_SMEM
+
+#define PRINTF_ENABLE       0
+#define PRINTF_BLOCK_COUNT  0
+
+//
+// NOTE:
+//
+// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
+// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
+//
+// NOTE:
+//
+// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.
+//
+//
+
+#if 0 // SKC_ARCH_AVX2
+
+// #define SKC_RASTERIZE_SUBGROUP_SIZE              1
+// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3
+// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1
+
+// #define SKC_TTXB_WORDS                           8
+
+// #define SKC_RASTERIZE_FLOAT                      float8
+// #define SKC_RASTERIZE_UINT                       uint8
+// #define SKC_RASTERIZE_INT                        int8
+// #define SKC_RASTERIZE_PREDICATE                  int8
+
+// #define SKC_RASTERIZE_BIN_BLOCK                  uint16
+// #define SKC_RASTERIZE_BIN                        uint8
+
+// #define SKC_RASTERIZE_POOL                       uint8
+// #define SKC_RASTERIZE_POOL_SCALE                 6
+
+// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1
+// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2
+
+// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()
+
+#endif
+
+//
+// SIMT
+//
+
+#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE
+#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
+#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)
+
+//
+//
+//
+
+#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }
+#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }
+
+//
+//
+//
+
+#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
+#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)
+#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
+#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+//    t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+//    a + (b - a) * t
+//
+// But this may be a native instruction on some devices. For example,
+// on GEN9 there is an LRP "linear interpolation" opcode but it
+// doesn't appear to support half floats.
+//
+// Feel free to toggle this option and then benchmark and inspect the
+// generated code.  We really want the double FMA to be generated when
+// there isn't support for a LERP/MIX operation.
+//
+
+#if 1
+#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t)      mix(a,b,t)
+#endif
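+
+// As a sanity check on the two-FMA form (an illustrative expansion,
+// not part of the build):
+//
+//   SKC_LERP(a,b,t) == mad(t,b,mad(-(t),a,a)) == t*b + (a - t*a) == a + t*(b-a)
+//
+// i.e. it agrees with mix(a,b,t) for t in [0,1].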
+
+//
+// There is no integer MAD in OpenCL with "don't care" overflow
+// semantics.
+//
+// FIXME -- verify if the platform needs explicit MAD operations even
+// if a "--fastmath" option is available at compile time.  It might
+// make sense to explicitly use MAD calls if the platform requires it.
+//
+
+#if 1
+#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
+#else
+#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
+#endif
+
+//
+//
+//
+
+#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())
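+
+// e.g. assuming SKC_DEVICE_SUBBLOCK_WORDS is 8, subblock id 5 and
+// subgroup lane 3 address block pool word 5*8+3 = 43 -- each lane of
+// the subgroup reads one word of the subblock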
+
+//
+//
+//
+
+union skc_bp_elem
+{
+  skc_uint              u32;
+  skc_tagged_block_id_t tag_id;
+  skc_float             coord;
+};
+
+//
+//
+//
+
+struct skc_subgroup_smem
+{
+  //
+  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
+  //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
+  struct {
+    union {
+
+      skc_uint                winner;
+
+      struct {
+        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
+      } aN;
+
+      struct {
+        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
+      } vN;
+    };
+  } subgroup;
+#endif
+
+  //
+  // work-in-progress TTSB blocks and associated YX keys
+  //
+  union {
+    struct {
+      // FIXME -- some typedefs are valid here
+      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
+      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+    } aN;
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+    struct {
+      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
+      SKC_RASTERIZE_BIN       yx;
+      SKC_RASTERIZE_BIN       id;
+      SKC_RASTERIZE_BIN       count;
+    } vN;
+#endif
+  } bin;
+};
+
+//
+//
+//
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+#define skc_subgroup_lane()  0
+#else
+#define skc_subgroup_lane()  get_sub_group_local_id()
+#endif
+
+//
+// replenish block ids
+//
+// note that you can't overrun the block id pool since it's a ring
+//
+
+static
+void
+skc_blocks_replenish(skc_uint                           * const blocks_next,
+                     skc_block_id_v_t                   * const blocks,
+                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                     __global skc_block_id_t   const    * const bp_ids)
+{
+  //
+  // get a new vector of block ids -- this is kind of a narrow
+  // allocation but subblocks help stretch out the pool.
+  //
+  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
+  //
+  skc_uint bp_idx = 0;
+
+  if (skc_subgroup_lane() == 0)
+    {
+      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
+                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
+#if 0
+      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
+#endif
+    }
+
+  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
+  *blocks      = bp_ids[bp_idx];
+  *blocks_next = 0;
+}
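+
+// note: lane 0 reserves SKC_RASTERIZE_BLOCK_ID_V_SIZE ids by bumping
+// the pool's read cursor and each lane then claims one id from the
+// pow2 ring at (base + lane) & bp_mask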
+
+//
+//
+//
+
+static
+skc_block_id_t
+skc_blocks_get_next(skc_uint                           * const blocks_next,
+                    skc_block_id_v_t                   * const blocks,
+                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                    __global skc_block_id_t   const    * const bp_ids)
+{
+  // replenish?
+  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
+    {
+      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+    }
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
+  //
+  // SIMT
+  //
+  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);
+
+#else
+  //
+  // SIMD
+  //
+  skc_block_id_t id = blocks->s0;
+
+  skc_shuffle_down_1(*blocks);
+
+#endif
+
+  *blocks_next += 1;
+
+  return id;
+}
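+
+// note: on SIMT each lane of 'blocks' holds one id from the last
+// replenish and the (*blocks_next)'th id is broadcast to the whole
+// subgroup; the SIMD path instead consumes .s0 and shifts the vector
+// down one element per grab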
+
+//
+// subblock allocator
+//
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+
+static
+skc_block_id_t
+skc_subblocks_get_next(skc_block_id_t                     * const subblocks,
+                       skc_uint                           * const blocks_next,
+                       skc_block_id_v_t                   * const blocks,
+                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                       __global skc_block_id_t   const    * const bp_ids)
+{
+  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
+    }
+
+  skc_block_id_t const sb_id = *subblocks;
+
+  *subblocks += 1;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("= %u\n",sb_id);
+#endif
+
+  return sb_id;
+}
+
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks
+
+#else
+
+#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
+#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks
+
+#endif
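+
+// when blocks are larger than subblocks, TTSB storage is handed out a
+// subblock at a time from the most recently acquired block; otherwise
+// every key consumes a whole block and no subblock cursor is needed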
+
+//
+//
+//
+
+static
+skc_block_id_t
+skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
+                  skc_uint                           * const blocks_next,
+                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
+                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
+                  __global skc_block_id_t   const    * const bp_ids,
+                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,
+                  skc_ttsk_v_t                       * const sk_v,
+                  skc_uint                           * const sk_v_next,
+                  __global skc_ttsk_s_t              * const sk_extent,
+                  skc_uint                             const new_yx)
+{
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
+                                                       blocks_next,
+                                                       blocks,
+                                                       bp_atomics,
+                                                       bp_mask,
+                                                       bp_ids);
+#else
+  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
+                                                    blocks,
+                                                    bp_atomics,
+                                                    bp_mask, // pow2 modulo mask for block pool ring
+                                                    bp_ids);
+#endif
+
+  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
+    {
+      sk_v->lo = new_id;
+      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
+#if 0
+      printf("@ ( %3u, %3u ) %u\n",
+             (new_yx >> 12) & 0xFFF,
+             (new_yx      ) & 0xFFF,
+             new_id);
+#endif
+    }
+
+  *sk_v_next += 1;
+
+  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
+    {
+      *sk_v_next = 0;
+
+      skc_uint sk_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
+#if 0
+          printf("+ %u\n",sk_idx);
+#endif
+        }
+
+      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
+      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
+#endif
+        {
+          sk_extent[sk_idx] = *sk_v;
+#if 0
+          printf("> %u : %v2u\n",sk_idx,*sk_v);
+#endif
+        }
+    }
+
+  return new_id;
+}
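+
+// roughly: skc_ttsk_v_append() grabs a fresh subblock id, lets the
+// (sk_v_next & SKC_RASTERIZE_TTSK_V_MASK)'th lane record { id, yx } in
+// its in-register key, and once a full vector of keys has accumulated
+// the subgroup reserves space in sk_extent and stores them out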
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to select between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  //  0 +
+  // --
+  // 01
+  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  //  012 +
+  // ----
+  // 0123
+  //   01 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  //  0123456 +
+  // --------
+  // 01234567
+  //   012345 +
+  // --------
+  // 01234567
+  //     0123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  //  0123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  //   0123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  //     0123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  //         01234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_add(v);
+
+#endif
+}
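+
+// worked example of the SIMD path (vector size 4): for v = (1,2,3,4)
+//
+//   w = (1,2,3,4) + (0,1,2,3) = (1,3,5,7)
+//   x = (1,3,5,7) + (0,0,1,3) = (1,3,6,10)
+//
+// which is the inclusive prefix sum of v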
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to select between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  //  0 +
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  //  012 +
+  // ----
+  // 0123
+  //   01 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  //  0123456 +
+  // --------
+  // 01234567
+  //   012345 +
+  // --------
+  // 01234567
+  //     0123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  //  0123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  //   0123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  //     0123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  //         01234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
+  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
+  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
+  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_add(v);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // Note that there isn't a built-in horizontal scan for vectors so
+  // we'll define some here for various widths.
+  //
+  // FIXME -- a scalar version might be faster so put in a
+  // compile-time switch to select between implementations
+  //
+
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  // 01
+  // 00 max
+  // --
+  // 01
+  SKC_RASTERIZE_UINT const w = max(v.s00,v);
+  return w;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  // 0123
+  // 0012 +
+  // ----
+  // 0123
+  // 0101 +
+  // ----
+  // 0123
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
+  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
+  return x;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  // 01234567
+  // 00123456 +
+  // --------
+  // 01234567
+  // 01012345 +
+  // --------
+  // 01234567
+  // 01230123 +
+  // --------
+  // 01234567
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
+  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
+  return y;
+
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  // 0123456789abcdef
+  // 00123456789abcde +
+  // ----------------
+  // 0123456789abcdef
+  // 010123456789abcd +
+  // ----------------
+  // 0123456789abcdef
+  // 01230123456789ab +
+  // ----------------
+  // 0123456789abcdef
+  // 0123456701234567 +
+  // ----------------
+  // 0123456789abcdef
+  //
+  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
+  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
+  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
+  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
+  return z;
+
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  return sub_group_scan_inclusive_max(v);
+
+#endif
+}
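+
+// worked example of the SIMD path (vector size 4): for v = (2,7,1,5)
+//
+//   w = max((2,2,7,1),(2,7,1,5)) = (2,7,7,5)
+//   x = max((2,7,2,7),(2,7,7,5)) = (2,7,7,7)
+//
+// which is the inclusive max scan of v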
+
+//
+//
+//
+
+static
+float
+skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_UINT
+skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return v.s1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return v.s3;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return v.s7;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return v.sf;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);
+
+#endif
+}
+
+//
+//
+//
+
+static
+float
+skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return v.s0;
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return sub_group_broadcast(v,0);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
+                      SKC_RASTERIZE_UINT  const i)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return v;
+#else
+  return shuffle(v,i);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return intel_sub_group_shuffle(v,i);
+
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
+                          SKC_RASTERIZE_FLOAT const c) // current
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  // FIXME -- there are alternative formulations here:
+  //
+  // Option 1:
+  //
+  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
+  //
+  // Option 2:
+  //
+  //   p is a scalar
+  //   t    = c.rotate(+1)
+  //   t.s0 = p;
+  //
+  // Option 3: ...
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return p;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return shuffle2(p,c,(uint2)(1,2));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return shuffle2(p,c,(uint4)(3,4,5,6));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return intel_sub_group_shuffle_up(p,c,1);
+
+#endif
+}
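+
+//
+// For example, assuming a 4-lane subgroup with p = { p0, p1, p2, p3 }
+// and c = { c0, c1, c2, c3 }, the shuffle-up above returns
+// { p3, c0, c1, c2 }: lane 0 receives the last lane of the
+// "previous" vector and every other lane receives its left neighbor
+// from the "current" vector.
+//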
+
+//
+//
+//
+
+static
+bool
+skc_is_lane_first()
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
+  //
+  // SIMD
+  //
+  return true;
+#else
+  //
+  // SIMT
+  //
+  return get_sub_group_local_id() == 0;
+#endif
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_delta_offset()
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+  return 1;
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
+#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
+  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
+#endif
+
+#else
+  //
+  // SIMT
+  //
+  return 1.0f + get_sub_group_local_id();
+
+#endif
+
+}
+
+//
+//
+//
+
+static
+int
+skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  return any(p);
+#else
+  //
+  // SIMT
+  //
+  return sub_group_any(p);
+#endif
+}
+
+//
+//
+//
+
+#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)
+
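+//
+// Advance the (nodeword,id) cursor to the next segment subblock of a
+// path.
+//
+// Sketch of the traversal: the subblock id is simply incremented
+// until it crosses a block boundary.  At that point the next tagged
+// block id is loaded from the current node word -- and if that word
+// is the node's terminal link word, the link is followed to the
+// first word of the next node block beforehand.
+//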
+void
+skc_segment_next(__global union skc_bp_elem * const bp_elems,
+                 skc_uint                   * const nodeword,
+                 skc_block_id_t             * const id)
+{
+  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+    {
+      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
+        {
+          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
+        }
+
+      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;
+
+      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+    }
+}
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
+{
+  return native_sqrt(x * x + y * y);
+}
+
+//
+// Wang's Formula (1985)
+//
+
+#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned
+
+#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)
+
+#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
+#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))
+
+#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
+#define SKC_WANG_SQRT(x)      native_sqrt(x)
+
+//
+//
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
+                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
+{
+  //
+  // Return the number of evenly spaced (in the parametric sense) line
+  // segments that are guaranteed to be within "epsilon" error of the
+  // curve.
+  //
+  // We're then going to take multiples of the reciprocal of this
+  // number so that the segmentation can be distributed across the
+  // subgroup.
+  //
+  // Note, this can probably be slightly optimized per architecture
+  // but it's probably far from being a hotspot since it's all
+  // straight-line unpredicated code.
+  //
+  // The result is an integral float in the range [1.0,#segments]
+  //
+  // Note that even if all of the control points are coincident, the
+  // max(1.0f) will categorize this as a line of 1 segment.
+  //
+  // This is what we want!  We want to convert cubics to lines as
+  // easily as possible and *then* cull lines that are either
+  // horizontal or zero length.
+  //
+  return max(1.0f,
+             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
+                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
+                                                    fabs(t3x - 2.0f * t2x + t1x)),
+                                                max(fabs(t2y - 2.0f * t1y + t0y),
+                                                    fabs(t3y - 2.0f * t2y + t1y))))));
+}
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
+                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
+                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
+{
+  return max(1.0f,
+             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
+                                SKC_WANG_LENGTH(fabs(t2x - 2.0f * t1x + t0x),
+                                                fabs(t2y - 2.0f * t1y + t0y)))));
+}
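+
+//
+// In equation form, Wang's bound for a cubic with control points
+// P0..P3 and flatness tolerance eps is:
+//
+//   n = ceil( sqrt( (3*2)/(8*eps) * max( |P2 - 2*P1 + P0| ,
+//                                        |P3 - 2*P2 + P1| ) ) )
+//
+// and for a quadratic with control points P0..P2:
+//
+//   n = ceil( sqrt( (2*1)/(8*eps) * |P2 - 2*P1 + P0| ) )
+//
+// The cubic variant above takes the componentwise max of the
+// absolute second differences before computing the length, which
+// yields an estimate that is never smaller than the max of the two
+// norms.
+//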
+
+//
+// rational curves
+//
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_cubic_rat()
+{
+  return 0.0f;
+}
+
+static
+SKC_RASTERIZE_FLOAT
+skc_wangs_formula_quad_rat()
+{
+  return 0.0f;
+}
+
+//
+// flush any work-in-progress blocks and return unused block ids
+//
+
+static
+void
+skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+             __global union skc_bp_elem                 * const bp_elems,
+             __global uint                              * const bp_ids,
+             skc_uint                                     const bp_mask,
+             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+             skc_block_id_v_t                           * const blocks,
+             skc_uint                                     const blocks_next,
+             skc_ttsk_v_t                               * const sk_v,
+             skc_uint                                     const sk_v_next,
+             __global skc_ttsk_s_t                      * const sk_extent,
+             __local  struct skc_subgroup_smem volatile * const smem)
+{
+  //
+  // flush non-empty bins
+  //
+  // FIXME -- accelerate this iteration/search with a subgroup operation
+  //
+  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
+    {
+      if (smem->bin.aN.count[ii] > 0)
+        {
+          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
+          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
+#if 0
+          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
+#endif
+          bp_elems[idx].u32 = tts;
+        }
+
+      //
+      // FIXME -- vectorize with vstoreN()
+      //
+    }
+
+  //
+  // return remaining block ids back to the pool
+  //
+  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;
+
+  if (blocks_rem > 0)
+    {
+      skc_uint bp_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
+
+#if 0
+          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
+#endif
+        }
+
+      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;
+
+      if (skc_subgroup_lane() >= blocks_next)
+        {
+          bp_ids[bp_idx] = *blocks;
+        }
+    }
+
+  //
+  // flush work-in-progress ryx keys
+  //
+  if (sk_v_next > 0)
+    {
+      skc_uint sk_idx = 0;
+
+      if (skc_subgroup_lane() == 0)
+        {
+          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
+#if 0
+          printf("* %u\n",sk_idx);
+#endif
+        }
+
+      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();
+
+      if (skc_subgroup_lane() < sk_v_next)
+        {
+          sk_extent[sk_idx] = *sk_v;
+        }
+    }
+}
+
+//
+// If there are lanes that were unable to append to a bin because
+// their hashes collided with the bin's current ryx key, then those
+// bins must be ejected.
+//
+// Note that we do not eject "full" bins because lazily waiting for a
+// collision results in simpler code.
+//
+
+static
+void
+skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+          __global union skc_bp_elem                 * const bp_elems,
+          __global uint                              * const bp_ids,
+          skc_uint                                     const bp_mask,
+          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+          skc_block_id_t                             * const subblocks,
+          skc_block_id_v_t                           * const blocks,
+          skc_uint                                   * const blocks_next,
+          skc_ttsk_v_t                               * const sk_v,
+          skc_uint                                   * const sk_v_next,
+          __global skc_ttsk_s_t                      * const sk_extent,
+          __local  struct skc_subgroup_smem volatile * const smem,
+          SKC_RASTERIZE_UINT                           const hash,
+          SKC_RASTERIZE_UINT                           const yx,
+          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value
+{
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+
+  //
+  // FIXME -- this code is now stale with the changes to the
+  // subblock/block allocation strategy
+  //
+
+  //
+  // get local TTSB ID queue count
+  //
+  skc_uint ttsb_id_count  = smem->pool.count; // scalar
+
+  // init hash bit mask
+  skc_uint component_mask = 0;
+
+  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
+    {
+      // if no collision continue
+      if (((int*)&is_collision)[cc] == 0)
+        continue;
+
+      uint const winner        = ((uint*)&hash)[cc];
+      uint const component_bit = 1u << winner;
+
+      // if already processed this hash then continue
+      if (component_mask & component_bit)
+        continue;
+
+      // update component mask
+      component_mask |= component_bit;
+
+      //
+      // new winner requires ejecting the old TTSB
+      //
+      if (smem->bin.aN.count[winner] > 0)
+        {
+          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+
+          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+        }
+
+        //
+        // ensure there is at least one TTSK and TTSB ID
+        //
+        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
+          {
+            //
+            // update remaining count
+            //
+            ttsb_id_count = 0;
+
+            //
+            // flush accumulated ttsk_ryx keys
+            //
+            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
+              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
+
+#if 0
+            printf("# %u\n",idx);
+#endif
+
+            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+              {
+                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
+              }
+
+            //
+            // allocate more ttsb ids from pool
+            //
+            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads
+
+            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
+              smem->pool.aN.id[ii] = bp_ids[id + ii];
+          }
+
+      //
+      // invalidate the winning block
+      //
+
+      //
+      // update bin with winning yx, new ttsb id and zero count
+      //
+      // all lanes are loading/storing from/to the same index
+      //
+      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
+      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
+      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
+      smem->bin.aN.count[winner] = 0;
+
+      //
+      // update count
+      //
+      ttsb_id_count += 1;
+    }
+
+  //
+  // save count
+  //
+  smem->pool.count = ttsb_id_count;
+
+#else
+  //
+  // SIMT
+  //
+
+  do {
+    //
+    // only one lane will win!
+    //
+    if (is_collision)
+      smem->subgroup.winner = hash;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // which bin is being ejected?
+    //
+    skc_uint const winner = smem->subgroup.winner;
+
+    //
+    // which colliding hash is taking over the bin?
+    //
+    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);
+
+    //
+    // all lanes with the same hash will try to store but only one
+    // lane will win
+    //
+    if (is_winner)
+      smem->subgroup.winner = yx;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //
+    // flush this block to the pool
+    //
+    if (smem->bin.aN.count[winner] > 0)
+      {
+        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
+        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
+#if 0
+        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
+#endif
+        bp_elems[idx].u32 = tts;
+      }
+
+    //
+    // append new ttsk
+    //
+    skc_uint       const new_yx = smem->subgroup.winner;
+    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
+                                                    blocks_next,
+                                                    bp_atomics,
+                                                    bp_mask, // pow2 modulo mask for block pool ring
+                                                    bp_ids,
+                                                    cohort_atomics,
+                                                    sk_v,
+                                                    sk_v_next,
+                                                    sk_extent,
+                                                    new_yx);
+
+#if 0
+    if (get_sub_group_local_id() == 0) {
+      printf(">>> %9u\n",new_id);
+    }
+#endif
+
+    //
+    // update bin with winning yx, new ttsb id and zero count
+    //
+    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
+    smem->bin.aN.yx   [winner]                      = new_yx;
+    smem->bin.aN.id   [winner]                      = new_id;
+    smem->bin.aN.count[winner]                      = 0;
+
+    //
+    // remove all lanes matching this hash
+    //
+    is_collision = is_collision && !is_winner;
+
+    //
+    // exit if nothing left to do
+    //
+  } while (sub_group_any(is_collision));
+
+#endif
+}
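+
+//
+// A note on termination of the SIMT ejection loop above: on each
+// pass one bin -- the one whose hash won the smem race -- is ejected
+// and rebound, and every colliding lane carrying that hash clears
+// its is_collision flag.  Each pass therefore retires at least one
+// distinct hash, so the loop runs at most once per distinct
+// colliding hash in the subgroup.
+//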
+
+//
+// scatter scan max
+//
+static
+SKC_RASTERIZE_UINT
+skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
+                     SKC_RASTERIZE_FLOAT                         const iss,
+                     SKC_RASTERIZE_FLOAT                         const ess)
+{
+  //
+  // prefix sums determine which lanes we're going to work on next
+  //
+  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
+  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
+  //
+  // SIMD APPROACH 1: SIMT'ISH
+  //
+
+  // zero the volatile smem scratchpad using vector syntax
+  smem->subgroup.vN.scratch[0] = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                         \
+  if (is_scratch_store C)                               \
+    smem->subgroup.aN.scratch[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  // propagate lanes to right using max scan
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+
+#else
+  //
+  // SIMD APPROACH 2: SCALAR'ISH
+  //
+
+  SKC_RASTERIZE_UINT source = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                 \
+  if (is_scratch_store C)                       \
+    ((uint *)&source)[scratch_idx C] = I;
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
+    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
+#endif
+
+#else
+  //
+  // SIMT
+  //
+
+  //
+  // zero the volatile smem scratchpad using vector syntax
+  //
+  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );
+
+  //
+  // store source lane at starting lane
+  //
+  if (is_scratch_store)
+    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();
+
+  //
+  // propagate lanes to right using max scan
+  //
+  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
+  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
+#endif
+
+  return source;
+}
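+
+//
+// A worked example, assuming a 4-lane subgroup: if the per-lane
+// segment counts are { 2, 1, 3, 1 } then iss = { 2, 3, 6, 7 } and
+// ess = { 0, 2, 3, 6 }.  Lanes 0-2 scatter their lane index into
+// scratch slots 0, 2 and 3 (lane 3's slot 6 is out of range), the
+// scratch vector reads back as { 0, 0, 1, 2 }, and the max scan
+// returns source = { 0, 0, 1, 2 }: lanes 0-1 work on lane 0's
+// segments, lane 2 on lane 1's and lane 3 on lane 2's.  The
+// remaining segments are picked up on later iterations after the
+// caller decrements iss/ess by the subgroup size.
+//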
+
+//
+// sliver lines into subpixels
+//
+
+static
+void
+skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
+           __global union skc_bp_elem                 * const bp_elems,
+           __global uint                              * const bp_ids,
+           skc_uint                                     const bp_mask,
+           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
+           skc_block_id_t                             * const subblocks,
+           skc_block_id_v_t                           * const blocks,
+           skc_uint                                   * const blocks_next,
+           skc_ttsk_v_t                               * const sk_v,
+           skc_uint                                   * const sk_v_next,
+           __global skc_ttsk_s_t                      * const sk_extent,
+           __local  struct skc_subgroup_smem volatile * const smem,
+           SKC_RASTERIZE_FLOAT                          const l0x,
+           SKC_RASTERIZE_FLOAT                          const l0y,
+           SKC_RASTERIZE_FLOAT                          const l1x,
+           SKC_RASTERIZE_FLOAT                          const l1y)
+{
+  //
+  // Y-SLIVERING
+  // -----------
+  //
+  // immediately sliver all multi-pixel lines into 1-pixel high
+  // lines
+  //
+  // note this implicitly squelches horizontal lines
+  //
+  // there is another test for horizontal lines after x-slivering
+  // is complete
+  //
+
+  //
+  // will we need to flip the sign of y_delta ?
+  //
+  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
+  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;
+
+  //
+  // save 1/dy
+  //
+  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);
+
+  //
+  // how many non-horizontal subpixel y-axis slivers are there?
+  //
+  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
+  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;
+  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;
+
+  //
+  // inclusive subgroup scan of y_segs
+  //
+  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);
+  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;
+  float                     y_rem   = skc_subgroup_last_float(y_iss);
+
+  //
+  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
+  //
+  if (y_segs == 0.0f)
+    y_iss = 0.0f;
+
+#if 0
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
+#endif
+
+  //
+  // these values don't matter on first iteration
+  //
+  SKC_RASTERIZE_FLOAT n1x_prev = 0;
+  SKC_RASTERIZE_FLOAT n1y_prev = 0;
+
+  //
+  // loop until done
+  //
+  while (y_rem > 0.0f)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);
+
+      //
+      // get the line at lane y_source
+      //
+      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
+      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
+      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
+      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);
+
+      //
+      // every lane will create a 1 pixel tall line "sliver"
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if numerator == 1 then this is the first lane
+      // if numerator == s then this is the last  lane
+      //
+      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
+      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);
+
+      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);
+
+      // toggle y_delta sign
+      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));
+
+      //
+      // calculate "right" line segment endpoint
+      //
+      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
+      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
+      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));
+
+      //
+      // override c1 if this is last point
+      //
+      n1y = select(n1y,m1y,is_y_last);
+      n1x = select(n1x,m1x,is_y_last);
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
+      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      n0y = select(n0y,m0y,is_y_first);
+      n0x = select(n0x,m0x,is_y_first);
+
+      //
+      // save previous right endpoint
+      //
+      n1x_prev = n1x;
+      n1y_prev = n1y;
+
+      //
+      // decrement by subgroup size
+      //
+      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+#if 0
+      //
+      // debug
+      //
+      if (n0y != n1y) {
+        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
+      }
+#endif
+
+      //
+      // X-SLIVERING
+      // -----------
+      //
+      // now sliver 1-pixel high lines into either vertical or
+      // 1-pixel wide lines
+      //
+      // save original direction and work with increasing x
+      //
+      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
+      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;
+
+      //
+      // save 1/dx
+      //
+      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);
+
+      //
+      // how many subpixel x-axis slivers are there?
+      //
+      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
+      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;
+      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);
+
+      //
+      // inclusive subgroup scan of x_segs
+      //
+      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);
+      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;
+      float                     x_rem    = skc_subgroup_last_float(x_iss);
+
+      //
+      // if this is a vertical line then tweak x_iss so "is_scratch_store" always fails
+      //
+      //if (x_segs == 0.0f)
+      // x_iss = 0.0f;
+
+      //
+      // these values don't matter on first iteration
+      //
+      SKC_RASTERIZE_FLOAT       p1x_prev = 0;
+      SKC_RASTERIZE_FLOAT       p1y_prev = 0;
+
+      //
+      // loop until done
+      //
+      while (x_rem > 0)
+        {
+          //
+          // distribute work across lanes
+          //
+          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);
+
+          //
+          // get the line at lane x_source
+          //
+          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
+          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
+          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
+          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);
+
+          //
+          // every lane will create a 1 pixel wide line "sliver"
+          //
+          // FIXME -- this gets expanded on SIMD
+          //
+          // if numerator == 1 then this is the first lane
+          // if numerator == s then this is the last  lane
+          //
+          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
+          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);
+
+          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
+          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);
+
+          // toggle x_delta sign
+          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));
+
+          //
+          // calculate "right" line segment endpoint
+          //
+          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
+          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
+          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));
+
+          //
+          // override c1 if this is last point
+          //
+          p1x = select(p1x,o1x,is_x_last);
+          p1y = select(p1y,o1y,is_x_last);
+
+          //
+          // shuffle up "left" line segment endpoint
+          //
+          // NOTE: Intel's shuffle_up is unique with its elegant
+          // "previous" argument so don't get used to it
+          //
+          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
+          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);
+
+          //
+          // override shuffle up if this is the first line segment
+          //
+          p0x = select(p0x,o0x,is_x_first);
+          p0y = select(p0y,o0y,is_x_first);
+
+          //
+          // save previous right endpoint
+          //
+          p1x_prev = p1x;
+          p1y_prev = p1y;
+
+          //
+          // decrement by subgroup size
+          //
+          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+          //
+          // only non-horizontal subpixel lines are valid
+          //
+          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);
+
+          //
+          // if no lanes are active then continue
+          //
+          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
+          // IMPACTS PERFORMANCE (+12% ?)
+          //
+          // IT SHOULDN'T !!!
+          //
+#if 0
+          if (!skc_subgroup_any(is_active))
+            continue;
+#endif
+
+          //
+          // Option 1: use SLM for explicitly managed coalesced stores
+          //
+          // 1. which tile does this line belong?
+          // 2. hash tile coordinates
+          // 3. lookup hash
+          // 4. if tile matches then SLM append keys
+          // 5. if tile doesn't match
+          //   a. flush
+          //   b. create new TTSK_RYX
+          //   c. obtain TTSB block from pool
+          //   d. goto 3.
+          //
+
+          //
+          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
+          //
+          // 1. which tile does this line belong?
+          // 2. hash tile coordinates
+          // 3. lookup hash
+          // 4. if tile matches then GMEM append keys
+          // 5. if tile doesn't match
+          //   a. flush (and invalidate empty elems)
+          //   b. create new TTSK_RYX
+          //   c. obtain TTSB block from pool
+          //   d. goto 3.
+          //
+
+          //
+          // The virtual rasterization surface is very large and
+          // signed: +/- ~64K-256K, depending on the architecture.
+          //
+          // Rasters must be clipped to the virtual surface and,
+          // optionally, clipped even further on a per raster
+          // basis.
+          //
+
+          //
+          // Clip to the per-raster clip
+          //
+
+          /*
+
+            CLIP HERE
+
+          */
+
+          //
+          // Hash the tile coordinates
+          //
+          // This table lists nominal values for each architecture.
+          // We want to choose values that naturally fit the
+          // "width" of the architecture.
+          //
+          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS
+          //   ----  -------  ----  ---------  --------  ---------
+          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?
+          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*
+          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?
+          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
+          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon
+          //
+          // NOTE: When possible, bias the hash toward using more y
+          // bits because of:
+          //
+          //   1. the 90 degree counter-clockwise rotation that we put
+          //      in place to offset the render-time clockwise
+          //      rotation
+          //
+          //   2. the likely presence of left-to-right or
+          //      right-to-left glyphs.
+          //
+          // For power-of-two bins, the hash is easy.
+          //
+          // For non-power-of-two, we may want to either implement a
+          // fast mod (compiler should do this for us... hahahaha) or
+          // drop down to the next power-of-two.
+          //
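+          //
+          // For example, assuming 8 bins addressed by 3 hash bits
+          // split as two y bits and one x bit (the 8-wide row in the
+          // table above), the power-of-two hash below reduces to:
+          //
+          //   hash = ((tile_y & 3) << 1) | (tile_x & 1)
+          //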
+
+          //
+          // FIXME -- this snarl is not good -- can probably reduce
+          // some of the sign casting but some is there to vectorize a
+          // scalar
+          //
+          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
+          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);
+
+          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
+          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);
+
+          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);
+          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);
+
+          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;
+
+          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
+          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);
+
+          //
+          // map [+1,+32] to [ 0,+31]
+          // map [-1,-32] to [-1,-32]
+          //
+          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;
+
+          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);
+          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);
+          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;
+
+          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
+          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);
+
+          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;
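+
+          //
+          // A sketch of the resulting TTS bit layout as implied by
+          // the shift amounts above (the exact field widths depend
+          // on the SKC_SUBTILE_* configuration):
+          //
+          //   [31:26] dys -- packed subpixel y delta (see mapping above)
+          //   [25:16] ty  -- subpixel y of min_y within the tile
+          //   [15:10] sx  -- subpixel x span (max_x - min_x)
+          //   [ 9: 0] tx  -- subpixel x of min_x within the tile
+          //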
+
+          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
+                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));
+
+          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));
+
+#if 0
+          printf("(%3u, %3u)\n",tile_y,tile_x);
+#endif
+
+#if 0
+          if (is_active)
+            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
+#endif
+
+          //
+          // debug
+          //
+#if 0 // PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+          if (is_active C)                                              \
+            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);
+
+          SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+          if (is_active)
+            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
+#endif
+
+#endif
+          //
+          // flush all active lanes
+          //
+          while (true)
+            {
+              //
+              // either gather load or vector load+shuffle the yx keys
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;
+              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);
+#else
+              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];
+#endif
+
+              //
+              // does yx for lane match yx for hash?
+              //
+              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
+              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);
+
+              //
+              // OpenCL spec: "When casting a bool to a vector integer
+              // data type, the vector components will be set to -1
+              // (i.e. all bits set) if the vector bool value is true
+              // and 0 otherwise."
+              //
+#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
+              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;
+#else
+              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}
+#endif
+              //
+              // how many new elements for each matching hash bin?
+              //
+              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
+              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;
+
+              //
+              // prefix sum all of the bins in parallel
+              //
+              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);
+              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);
+
+              //
+              // current bin counts
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;
+              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);
+#else
+              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];
+#endif
+
+              //
+              // calculate where each cache-hit and in-bounds tts should be stored
+              //
+              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
+              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;
+
+              //
+              // which lanes can append to a matching bin?
+              //
+              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);
+
+              //
+              // scatter append tts elements to bin blocks
+              //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
+              //
+              // SIMD
+              //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+              if (is_append C)                                          \
+                {                                                       \
+                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \
+                  smem->bin.aN.count[hash C]               = count_new C; \
+                }
+
+              SKC_RASTERIZE_VECTOR_EXPAND();
+#else
+              //
+              // SIMT
+              //
+              if (is_append)
+                {
+                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
+                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
+                }
+#endif
+              //
+              // try to keep predicate updates SIMD-friendly and
+              // outside of predicated code paths -- this is not
+              // always how we would normally do things on SIMT but
+              // either approach is acceptable
+              //
+
+              //
+              // mask off lanes/components that successfully appended
+              //
+              is_active = is_active && !is_append;
+
+              //
+              // are there any active lanes left?
+              //
+              if (!skc_subgroup_any(is_active))
+                break;
+
+              //
+              // There are active lanes that couldn't be appended to a
+              // bin because their hashes collided with the bin's
+              // current ryx key, so those bins must be ejected.
+              //
+              // Note that we do not eject "full" bins because lazily
+              // waiting for a collision results in simpler code.
+              //
+              skc_flush(bp_atomics,
+                        bp_elems,
+                        bp_ids,
+                        bp_mask,
+                        cohort_atomics,
+                        subblocks,
+                        blocks,
+                        blocks_next,
+                        sk_v,
+                        sk_v_next,
+                        sk_extent,
+                        smem,
+                        hash,
+                        yx,
+                        is_active);
+            }
+        }
+    }
+}
+
+//
+// INITIALIZE SMEM
+//
+// Note that SIMD/SIMT have nearly the same syntax.
+//
+static
+void
+skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
+{
+  //
+  // initialize smem bins
+  //
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
+  smem->bin.vN.count = ( 0 );
+#else
+  //
+  // SIMT
+  //
+  int idx = skc_subgroup_lane();
+
+#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
+#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
+  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
+#endif
+    {
+      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
+      smem->bin.aN.count[idx] = ( 0 );
+    }
+#endif
+}
+
+//
+// RASTERIZE CUBIC KERNEL
+//
+
+static
+void
+skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                     __global union skc_bp_elem                * const bp_elems,
+                     __global uint                             * const bp_ids,
+                     skc_uint                                    const bp_mask,
+
+                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                     __global skc_ttsk_s_t                     * const sk_extent,
+
+                     __local struct skc_subgroup_smem volatile * const smem,
+
+                     skc_uint                                  * const nodeword,
+                     skc_block_id_t                            * const id,
+
+                     union skc_transform              const    * const tv,
+                     union skc_path_clip              const    * const cv,
+                     skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  //
+  // apply transform
+  //
+  // note that only the end points need to be rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only -- add perspective support later
+  //
+  // the affine transformation requires 8 FMA + 2 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t3x = round(c3x * tv->sx  + c3y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const t3y = round(c3x * tv->shy + c3y * tv->sy  + tv->ty);
+
+  //
+  //
+  //
+#if PRINTF_ENABLE
+
+#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \
+         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \
+         b0x C,b0y C,t1x C,t1y C,                                       \
+         t2x C,t2y C,t3x C,t3y C);
+
+  SKC_RASTERIZE_VECTOR_EXPAND();
+
+#else
+
+  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
+         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+#endif
+
+#endif
+
+  //
+  // OLD APPROACH
+  // ------------
+  //
+  // The Spinel CUDA rasterizer was significantly more complex and
+  // performed a few different tasks that are probably best kept
+  // separate.
+  //
+  // The Spinel rasterizer held a Bezier's 4-element x and y
+  // coordinates in adjacent lanes. This simplified intermingling of
+  // single-lane 4-coordinate line segments with two-lane cubic
+  // Beziers.
+  //
+  // After transformation of the input segments, the Spinel rasterizer
+  // would test cubics for flatness and, if flat, collapse the
+  // adjacent lanes into a single line lane and an empty lane.
+  //
+  // Any lines would then be appended to a line queue.
+  //
+  // Any cubics would then be subdivided.
+  //
+  // The reclassification process would be repeated.
+  //
+  // NEW APPROACH
+  // ------------
+  //
+  // Assume we're only working with cubics in this kernel.
+  //
+  // Optimization: if the curve is a special case -- a cusp, has 1+
+  // inflections, or a loop -- it might be beneficial to subdivide
+  // the control cage 1+ times in order to separate the flatter
+  // segments from the high-velocity region(s).
+  //
+  // This means we want to split using [a,b] formulation to _directly_
+  // subdivide producing a new control cage.
+  //
+  // Wang's Formula is still useful even if we subdivide once or twice
+  // as it's so cheap that it might give some useful hints about where
+  // the high-velocity sections of curve reside.
+  //
+  // But it seems like using Wang's and directly flattening to line
+  // segments without any subdivision is good enough for the limited
+  // set of test cases that I've tried.
+  //
+  // So... use Wang's Formula to estimate how many line segments are
+  // required to properly flatten the cubics.
+  //
+  // Then use inclusive/exclusive scans to put all the lanes to work:
+  //
+  //   1. segmenting cubics to line segments
+  //
+  //   2. slivering line segments into 1-pixel high line segments
+  //
+  //   3. slivering 1-pixel high line segments into 1-pixel wide line
+  //      segments
+  //
+  // MORE BACKGROUND ON NEW APPROACH
+  // -------------------------------
+  //
+  // Two options for handling line segments:
+  //
+  // 1. append the line segments onto an SLM array until enough
+  //    work has been accrued (Spinel does this)
+  //
+  // 2. immediately sliver the potentially multi-pixel line
+  //    segments into subpixel lines
+  //
+  // The advantage of (1) is that it guarantees the slivering
+  // process will, on average, always be emitting a full subgroup
+  // of subpixel lines.
+  //
+  // The advantage of (2) is that it reduces code complexity and
+  // leaves more room for SLM tile bins. The difference between Spinel
+  // and Skia Compute is that Wang's Formula guarantees there will be
+  // a full subgroup of multi-pixel lines unless this is the final
+  // iteration over the warp's multi-pixel lines.
+  //
+  // Note that wider GPU architectures might benefit from (1) and
+  // other work accumulation strategies because it will minimize
+  // partial warp workloads in the final iteration of each stage.  It
+  // also minimizes the sunk cost of the uniform control logic steps.
+  //
+  // So let's implement (2) for now...
+  //
+
+  //
+  // And... begin!
+  //
+  // Estimate how many line segments are in the quad/cubic curve.
+  //
+  // Wang's Formula will return zero if the control points are
+  // collinear but we bump it up to 1.0f.
+  //
+  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);
+
+  //
+  // if there are free registers then precalculate the reciprocal of
+  // the estimated segment count since it will never change
+  //
+  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+  //
+  // inclusive add scan of estimated line segments
+  // exclusive add scan of estimated line segments
+  // total number       of estimated line segments
+  //
+  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
+  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
+  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
+
+  //
+  // Precompute cubic polynomial coefficients from transformed control
+  // cage so we can shuffle them in on each iteration of the outer
+  // loop and then evaluate the polynomial in Horner form.
+  //
+  //                            |  1  0  0  0 | | c0 |
+  //                            |             | |    |
+  //                            | -3  3  0  0 | | c1 |
+  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |
+  //                            |  3 -6  3  0 | | c2 |
+  //                            |             | |    |
+  //                            | -1  3 -3  1 | | c3 |
+  //
+  //
+  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
+  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
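+
+  //
+  // With these coefficients the curve is evaluated further below in
+  // Horner form:
+  //
+  //   B(t) = ((b3 * t + b2) * t + b1) * t + b0
+  //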
+
+  //
+  // these values don't matter on the first iteration
+  //
+  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
+  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // loop until done
+  //
+  while (s_rem > 0)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+      //
+      // every lane has a fraction to work off of
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if delta == 1      then this is the first lane
+      // if delta >= count  then this is the last  lane
+      //
+      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
+
+      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
+
+      //
+      // init parametric t
+      //
+      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+      //
+      // if last then override to a hard 1.0f
+      //
+      s_t    = is_s_last ? 1.0f : s_t;
+
+      //
+      // decrement by subgroup size
+      //
+      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+      //
+      // now every lane knows what to do and the following lines will
+      // pump out up to SUBGROUP_SIZE line segments
+      //
+      // obtain the src vertices through shared memory or via a shuffle
+      //
+
+      //
+      // shuffle in the polynomial coefficients from their source lane
+      //
+      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
+      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);
+
+      //
+      // calculate "right" line segment endpoint using Horner form
+      //
+      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
+      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+      //
+      // save previous right endpoint
+      //
+      l1x_prev = l1x;
+      l1y_prev = l1y;
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      l0x = select(l0x,s0x,is_s_first);
+      l0y = select(l0y,s0y,is_s_first);
+
+      //
+      // sliver lines
+      //
+      skc_sliver(bp_atomics,
+                 bp_elems,
+                 bp_ids,
+                 bp_mask,
+                 cohort_atomics,
+                 &subblocks,
+                 &blocks,
+                 &blocks_next,
+                 &sk_v,
+                 &sk_v_next,
+                 sk_extent,
+                 smem,
+                 l0x,l0y,l1x,l1y);
+    }
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+// RASTERIZE QUAD KERNEL
+//
+
+static
+void
+skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                    __global union skc_bp_elem                * const bp_elems,
+                    __global uint                             * const bp_ids,
+                    skc_uint                                    const bp_mask,
+
+                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                    __global skc_ttsk_s_t                     * const sk_extent,
+
+                    __local struct skc_subgroup_smem volatile * const smem,
+                    
+                    skc_uint                                  * const nodeword,
+                    skc_block_id_t                            * const id,
+
+                    union skc_transform              const    * const tv,
+                    union skc_path_clip              const    * const cv,
+                    skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  //
+  // apply transform
+  //
+  // note that only the end points need to be rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only -- add perspective support later
+  //
+  // the affine transformation requires 8 FMA + 2 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const b0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const b0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
+  SKC_RASTERIZE_FLOAT const t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;
+
+  SKC_RASTERIZE_FLOAT const t2x = round(c2x * tv->sx  + c2y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const t2y = round(c2x * tv->shy + c2y * tv->sy  + tv->ty);
+
+  //
+  // Estimate how many line segments are in the quad/cubic curve.
+  //
+  // Wang's Formula will return zero if the control points are
+  // collinear but we bump it up to 1.0f.
+  //
+  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);
+
+  //
+  // if there are free registers then precalculate the reciprocal of
+  // each lane's estimated segment count since it will never change
+  //
+  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);
+
+
+  //
+  // inclusive add scan of estimated line segments
+  // exclusive add scan of estimated line segments
+  // total number       of estimated line segments
+  //
+  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
+  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
+  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar
+
+  //
+  // Precompute quadratic polynomial coefficients from control cage so
+  // we can shuffle them in on each iteration of the outer loop and
+  // then evaluate the polynomial in Horner form.
+  //
+
+  //                        |  1  0  0  | | c0 |
+  //                        |           | |    |
+  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |
+  //                        |           | |    |
+  //                        |  1 -2  1  | | c2 |
+  //
+  //
+  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
+  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL
+
+  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
+  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD
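+
+  //
+  // i.e. with b1 = 2*(t1 - b0) and b2 = b0 - 2*t1 + t2 the curve is
+  //
+  //   B(t) = b0 + b1*t + b2*t^2  =  (b2*t + b1)*t + b0
+  //
+  // which is exactly the two-MAD Horner evaluation performed in the
+  // loop below
+  //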
+
+  //
+  // these values don't matter on the first iteration
+  //
+  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
+  SKC_RASTERIZE_FLOAT l1y_prev  = 0;
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // loop until done
+  //
+  while (s_rem > 0)
+    {
+      //
+      // distribute work across lanes
+      //
+      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);
+
+      //
+      // every lane has a fraction to work off of
+      //
+      // FIXME -- this gets expanded on SIMD
+      //
+      // if delta == 1     then this lane emits the first segment of its curve
+      // if delta >= count then this lane emits the last  segment of its curve
+      //
+      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
+      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);
+
+      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
+      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);
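+
+      //
+      // Worked example (hypothetical 4-lane subgroup, assuming
+      // skc_delta_offset() numbers lanes 1..4): curves with
+      // s_segs = { 2, 1, 3, 2 } give s_ess = { 0, 2, 3, 6 }, so the
+      // scatter/scan resolves s_source = { 0, 0, 1, 2 } and the lanes
+      // emit segment 1/2 and 2/2 of curve 0, segment 1/1 of curve 1
+      // and segment 1/3 of curve 2; the remaining segments are picked
+      // up on the next trip around the loop.
+      //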
+
+      //
+      // init parametric t
+      //
+      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?
+
+      //
+      // if last then override to a hard 1.0f
+      //
+      s_t    = is_s_last ? 1.0f : s_t;
+
+      //
+      // decrement by subgroup size
+      //
+      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
+
+      //
+      // now every lane knows what to do and the following lines will
+      // pump out up to SUBGROUP_SIZE line segments
+      //
+      // obtain the src vertices through shared or via a shuffle
+      //
+
+      //
+      // shuffle in the polynomial coefficients from their source lane
+      //
+      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
+      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
+      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);
+
+      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
+      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);
+
+      //
+      // calculate "right" line segment endpoint using Horner form
+      //
+      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
+      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND
+
+      //
+      // shuffle up "left" line segment endpoint
+      //
+      // NOTE: Intel's shuffle_up is unique with its elegant
+      // "previous" argument so don't get used to it
+      //
+      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
+      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);
+
+      //
+      // save previous right endpoint
+      //
+      l1x_prev = l1x;
+      l1y_prev = l1y;
+
+      //
+      // override shuffle up if this is the first line segment
+      //
+      l0x = select(l0x,s0x,is_s_first);
+      l0y = select(l0y,s0y,is_s_first);
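+
+      //
+      // at this point each lane holds one line segment: its right
+      // endpoint is the point it just evaluated and its left endpoint
+      // is either the previous lane's right endpoint or, for the first
+      // segment of a curve, the curve's own start point s0
+      //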
+
+      //
+      // sliver lines
+      //
+      skc_sliver(bp_atomics,
+                 bp_elems,
+                 bp_ids,
+                 bp_mask,
+                 cohort_atomics,
+                 &subblocks,
+                 &blocks,
+                 &blocks_next,
+                 &sk_v,
+                 &sk_v_next,
+                 sk_extent,
+                 smem,
+                 l0x,l0y,l1x,l1y);
+    }
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+// RASTERIZE LINE KERNEL
+//
+
+static
+void
+skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                    __global union skc_bp_elem                * const bp_elems,
+                    __global uint                             * const bp_ids,
+                    skc_uint                                    const bp_mask,
+
+                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                    __global skc_ttsk_s_t                     * const sk_extent,
+
+                    __local struct skc_subgroup_smem volatile * const smem,
+                    
+                    skc_uint                                  * const nodeword,
+                    skc_block_id_t                            * const id,
+
+                    union skc_transform              const    * const tv,
+                    union skc_path_clip              const    * const cv,
+                    skc_uint                                    const cohort)
+{
+  //
+  // the initial segment idx and segments-per-block constant determine
+  // how many block ids will need to be loaded
+  //
+  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+  skc_segment_next(bp_elems,nodeword,id);
+
+  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;
+
+#if 0
+  // printf("%5u : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",(skc_uint)get_global_id(0),c0x,c0y,c1x,c1y);
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
+#endif
+
+  //
+  // apply transform
+  //
+  // note that we only care that the end points are rounded to subpixel precision
+  //
+  // FIXME -- transformation is currently affine-only
+  // FIXME -- support perspective later
+  //
+  // the affine transformation requires 8 FMA + 4 ROUND operations
+  //
+  SKC_RASTERIZE_FLOAT const l0x = round(c0x * tv->sx  + c0y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const l0y = round(c0x * tv->shy + c0y * tv->sy  + tv->ty);
+
+  SKC_RASTERIZE_FLOAT const l1x = round(c1x * tv->sx  + c1y * tv->shx + tv->tx);
+  SKC_RASTERIZE_FLOAT const l1y = round(c1x * tv->shy + c1y * tv->sy  + tv->ty);
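+
+  //
+  // i.e. each endpoint is mapped by the 2x3 affine matrix and snapped
+  // to the subpixel grid:
+  //
+  //   | x' |           | sx  shx  tx | | x |
+  //   |    | = round ( |             | | y | )
+  //   | y' |           | shy  sy  ty | | 1 |
+  //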
+
+#if 0
+  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
+#endif
+
+  //
+  // allocate and init in-register TTSK keys
+  //
+  skc_uint     sk_v_next = 0;
+  skc_ttsk_v_t sk_v; 
+
+  sk_v.hi = cohort;
+
+  //
+  // initialize smem
+  //
+  skc_smem_init(smem);
+
+  //
+  // initialize blocks / subblocks
+  //
+  skc_block_id_v_t blocks;
+  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;
+
+#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
+  skc_block_id_t   subblocks   = 0;
+#endif
+
+  //
+  // sliver lines
+  //
+  skc_sliver(bp_atomics,
+             bp_elems,
+             bp_ids,
+             bp_mask,
+             cohort_atomics,
+             &subblocks,
+             &blocks,
+             &blocks_next,
+             &sk_v,
+             &sk_v_next,
+             sk_extent,
+             smem,
+             l0x,l0y,l1x,l1y);
+
+  //
+  // - flush work-in-progress blocks
+  // - return unused block ids
+  //
+  skc_finalize(bp_atomics,
+               bp_elems,
+               bp_ids,
+               bp_mask,
+               cohort_atomics,
+               &blocks,
+               blocks_next,
+               &sk_v,
+               sk_v_next,
+               sk_extent,
+               smem);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                         __global union skc_bp_elem                * const bp_elems,
+                         __global uint                             * const bp_ids,
+                         skc_uint                                    const bp_mask,
+
+                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                         __global skc_ttsk_s_t                     * const sk_extent,
+
+                         __global float8                  const    * const transforms, // FIXME -- __constant
+                         __global float4                  const    * const clips,      // FIXME -- __constant
+                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                         skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
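+
+  // each subgroup owns a private slice of the local memory array so
+  // subgroups sharing a workgroup never share scratch space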
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to recognize
+  // get_group_id(0) as a uniform, but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and is driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
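+
+  // e.g. with SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 4, the subgroups of
+  // workgroup 3 would process commands 12..15 -- one rasterization
+  // command per subgroup either way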
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("+cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // if workgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("-cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("[ %u ]< %u, %u, %u, %u >\n",
+           cmd_idx,
+           cmd.nodeword,
+           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
+           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
+           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
+#endif
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
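+
+  // the tag selects which specialized rasterizer the switch below
+  // dispatches to and the id locates the first block of the path's
+  // segment data in the block pool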
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between path element endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled, either by initializing a transform stack
+  // with the subpixel-resolution-scaled identity or by scaling the
+  // transform before it's loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
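+
+  //
+  // Illustrative host-side sketch (not part of this kernel -- the
+  // helper name is hypothetical): pre-scaling a user transform into
+  // subpixel space just scales the x row by SKC_SUBPIXEL_RESL_X_F32
+  // and the y row by SKC_SUBPIXEL_RESL_Y_F32 before upload:
+  //
+  //   // float8 layout: { sx shx tx shy sy ty w0 w1 }
+  //   void scale_transform_to_subpixel(float * const t8)
+  //   {
+  //     t8[0] *= SKC_SUBPIXEL_RESL_X_F32; // sx
+  //     t8[1] *= SKC_SUBPIXEL_RESL_X_F32; // shx
+  //     t8[2] *= SKC_SUBPIXEL_RESL_X_F32; // tx
+  //     t8[3] *= SKC_SUBPIXEL_RESL_Y_F32; // shy
+  //     t8[4] *= SKC_SUBPIXEL_RESL_Y_F32; // sy
+  //     t8[5] *= SKC_SUBPIXEL_RESL_Y_F32; // ty
+  //   }
+  //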
+
+  switch (tag)
+    {
+    case SKC_BLOCK_ID_TAG_PATH_LINE:
+      skc_rasterize_lines(bp_atomics,
+                          bp_elems,
+                          bp_ids,
+                          bp_mask,
+                          cohort_atomics,
+                          sk_extent,
+                          smem,
+                          &nodeword,&id,
+                          &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_QUAD:
+      skc_rasterize_quads(bp_atomics,
+                          bp_elems,
+                          bp_ids,
+                          bp_mask,
+                          cohort_atomics,
+                          sk_extent,
+                          smem,
+                          &nodeword,&id,
+                          &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
+      skc_rasterize_cubics(bp_atomics,
+                           bp_elems,
+                           bp_ids,
+                           bp_mask,
+                           cohort_atomics,
+                           sk_extent,
+                           smem,
+                           &nodeword,&id,
+                           &tv,&cv,cohort);
+      break;
+
+    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
+      break;
+    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
+      break;
+
+    default:
+      break;
+    }
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                           __global union skc_bp_elem                * const bp_elems,
+                           __global uint                             * const bp_ids,
+                           skc_uint                                    const bp_mask,
+
+                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                           __global skc_ttsk_s_t                     * const sk_extent,
+
+                           __global float8                  const    * const transforms, // FIXME -- __constant
+                           __global float4                  const    * const clips,      // FIXME -- __constant
+                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                           skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to recognize
+  // get_group_id(0) as a uniform, but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and is driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if workgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between path element endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled, either by initializing a transform stack
+  // with the subpixel-resolution-scaled identity or by scaling the
+  // transform before it's loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_lines(bp_atomics,
+                      bp_elems,
+                      bp_ids,
+                      bp_mask,
+                      cohort_atomics,
+                      sk_extent,
+                      smem,
+                      &nodeword,&id,
+                      &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                           __global union skc_bp_elem                * const bp_elems,
+                           __global uint                             * const bp_ids,
+                           skc_uint                                    const bp_mask,
+
+                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                           __global skc_ttsk_s_t                     * const sk_extent,
+
+                           __global float8                  const    * const transforms, // FIXME -- __constant
+                           __global float4                  const    * const clips,      // FIXME -- __constant
+                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                           skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to recognize
+  // get_group_id(0) as a uniform, but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and is driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if workgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between path element endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled, either by initializing a transform stack
+  // with the subpixel-resolution-scaled identity or by scaling the
+  // transform before it's loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_quads(bp_atomics,
+                      bp_elems,
+                      bp_ids,
+                      bp_mask,
+                      cohort_atomics,
+                      sk_extent,
+                      smem,
+                      &nodeword,&id,
+                      &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                            __global union skc_bp_elem                * const bp_elems,
+                            __global uint                             * const bp_ids,
+                            skc_uint                                    const bp_mask,
+
+                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                            __global skc_ttsk_s_t                     * const sk_extent,
+
+                            __global float8                  const    * const transforms, // FIXME -- __constant
+                            __global float4                  const    * const clips,      // FIXME -- __constant
+                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                            skc_uint                                    const count)
+{
+  //
+  // declare shared memory block
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  __local struct skc_subgroup_smem volatile                smem[1];
+#else
+  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
+  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
+#endif
+  
+  //
+  // this is a subgroup/warp-centric kernel
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler appears to recognize
+  // get_group_id(0) as a uniform, but the alternative calculation used
+  // when there are multiple subgroups per workgroup is not
+  // cooperating and is driving spillage elsewhere.
+  //
+#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
+  uint const cmd_idx = get_group_id(0);
+#else
+  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // if workgroups are multi-subgroup then there may be excess
+  // subgroups in the final workgroup
+  //
+  if (cmd_idx >= count)
+    return;
+
+#if 0
+  if (get_sub_group_local_id() == 0)
+    printf("cmd_idx = %u\n",cmd_idx);
+#endif
+
+  //
+  // load a single command for this subgroup
+  //
+  union skc_cmd_rasterize const cmd = cmds[cmd_idx];
+
+  //
+  // get first block node command word and its subblock
+  //
+  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
+  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
+  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
+
+  //
+  // load transform -- uniform across subgroup
+  //
+  // v8: { sx shx tx shy sy ty w0 w1 }
+  //
+  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
+  //
+  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
+  //
+  // Coordinates are scaled to subpixel resolution.  All that matters
+  // is that continuity is maintained between path element endpoints.
+  //
+  // It's the responsibility of the host to ensure that the transforms
+  // are properly scaled, either by initializing a transform stack
+  // with the subpixel-resolution-scaled identity or by scaling the
+  // transform before it's loaded by a rasterization grid.
+  //
+  // FIXME -- horizontal load might be better than this broadcast load
+  //
+  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
+  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
+  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted
+
+  skc_rasterize_cubics(bp_atomics,
+                       bp_elems,
+                       bp_ids,
+                       bp_mask,
+                       cohort_atomics,
+                       sk_extent,
+                       smem,
+                       &nodeword,&id,
+                       &tv,&cv,cohort);
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                               __global union skc_bp_elem                * const bp_elems,
+                               __global uint                             * const bp_ids,
+                               skc_uint                                    const bp_mask,
+
+                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                               __global skc_ttsk_s_t                     * const sk_extent,
+
+                               __global float8                  const    * const transforms, // FIXME -- __constant
+                               __global float4                  const    * const clips,      // FIXME -- __constant
+                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                               skc_uint                                    const count)
+{
+  ;
+}
+
+//
+//
+//
+
+__kernel
+SKC_RASTERIZE_KERNEL_ATTRIBS
+void
+skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
+                                __global union skc_bp_elem                * const bp_elems,
+                                __global uint                             * const bp_ids,
+                                skc_uint                                    const bp_mask,
+
+                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
+                                __global skc_ttsk_s_t                     * const sk_extent,
+
+                                __global float8                  const    * const transforms, // FIXME -- __constant
+                                __global float4                  const    * const clips,      // FIXME -- __constant
+                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
+                                skc_uint                                    const count)
+{
+  ;
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl
index 0c7da7d..0db21de 100644
--- a/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/rasters_alloc.cl
@@ -1,144 +1,144 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "raster.h"

-#include "atomic_cl.h"

-#include "block_pool_cl.h"

-#include "raster_builder_cl_12.h"

-#include "device_cl_12.h"

-

-//

-// There is a fixed-size meta table per raster cohort that we use to

-// peform a mostly coalesced sizing and allocation of blocks.

-//

-// This code is simple and fast.

-//

-

-__kernel

-SKC_RASTERS_ALLOC_KERNEL_ATTRIBS

-void

-skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,

-                         __global skc_block_id_t  const    * const bp_ids,

-                         skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring

-                         __global skc_block_id_t           * const map,

-                         __global skc_uint                 * const metas,

-                         __global skc_uint        const    * const raster_ids, // FIXME -- CONSTANT

-                         skc_uint                            const count)

-{

-  // access to the meta extent is linear

-  skc_uint const gid       = get_global_id(0);

-  skc_bool const is_active = gid < count;

-

-  //

-  // init with defaults for all lanes

-  //

-  union skc_raster_cohort_meta_inout meta         = { .in.u32v4 = { 0, 0, 0, 0 } };

-  skc_uint                           raster_id    = SKC_UINT_MAX;

-  skc_uint                           extra_blocks = 0;

-

-  if (is_active)

-    {

-      // load meta_in

-      meta.in.u32v4     = vload4(gid,metas);

-

-      // load raster_id as early as possible

-      raster_id         = raster_ids[gid];

-

-#if 0

-      printf("%3u + %5u, %5u, %5u, %5u\n",

-             gid,

-             meta.in.blocks,

-             meta.in.offset,

-             meta.in.pk,

-             meta.in.rk);

-#endif

-

-      // how many blocks will the ttpb blocks consume?

-      extra_blocks      = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / 

-                           SKC_DEVICE_SUBBLOCKS_PER_BLOCK);

-

-      // total keys

-      meta.out.keys    += meta.in.pk;

-

-      // how many blocks do we need to store the keys in the head and trailing nodes?

-      skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /

-                           (SKC_RASTER_NODE_DWORDS - 1));

-      // increment blocks

-      extra_blocks     += hn;

-

-      // how many nodes trail the head?

-      meta.out.nodes    = hn - 1;

-      

-      // update blocks

-      meta.out.blocks  += extra_blocks;

-

-#if 0

-      printf("%3u - %5u, %5u, %5u, %5u\n",

-             gid,

-             meta.out.blocks,

-             meta.out.offset,

-             meta.out.nodes,

-             meta.out.keys);

-#endif

-    }

-

-  //

-  // allocate blocks from block pool

-  //

-  // first perform a prefix sum on the subgroup to reduce atomic

-  // operation traffic

-  //

-  // note this idiom can be implemented with vectors, subgroups or

-  // workgroups

-  //

-  

-  skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);

-  skc_uint       reads  = 0;

-

-  // last lane performs the block pool allocation with an atomic increment

-  if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {

-    reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads

-  }

-

-  // broadcast block pool base to all lanes

-  reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);

-

-  // update base for each lane

-  reads += prefix - extra_blocks;

-

-  //

-  // store meta header

-  //

-  if (is_active)

-    {

-      // store headers back to meta extent

-      vstore4(meta.out.u32v4,gid,metas);

-

-      // store reads

-      metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; 

-

-      // get block_id of each raster head 

-      skc_block_id_t const block_id = bp_ids[reads & bp_mask];

-

-      // update map

-      map[raster_id] = block_id;

-

-#if 0

-      printf("alloc: %u / %u\n",raster_id,block_id);

-#endif

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "raster.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "raster_builder_cl_12.h"
+#include "kernel_cl_12.h"
+
+//
+// There is a fixed-size meta table per raster cohort that we use to
+// perform a mostly coalesced sizing and allocation of blocks.
+//
+// This code is simple and fast.
+//
+
+__kernel
+SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
+void
+skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
+                         __global skc_block_id_t  const    * const bp_ids,
+                         skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
+                         __global skc_block_id_t           * const map,
+                         __global skc_uint                 * const metas,
+                         __global skc_uint        const    * const raster_ids, // FIXME -- CONSTANT
+                         skc_uint                            const count)
+{
+  // access to the meta extent is linear
+  skc_uint const gid       = get_global_id(0);
+  skc_bool const is_active = gid < count;
+
+  //
+  // init with defaults for all lanes
+  //
+  union skc_raster_cohort_meta_inout meta         = { .in.u32v4 = { 0, 0, 0, 0 } };
+  skc_uint                           raster_id    = SKC_UINT_MAX;
+  skc_uint                           extra_blocks = 0;
+
+  if (is_active)
+    {
+      // load meta_in
+      meta.in.u32v4     = vload4(gid,metas);
+
+      // load raster_id as early as possible
+      raster_id         = raster_ids[gid];
+
+#if 0
+      printf("%3u + %5u, %5u, %5u, %5u\n",
+             gid,
+             meta.in.blocks,
+             meta.in.offset,
+             meta.in.pk,
+             meta.in.rk);
+#endif
+
+      // how many blocks will the ttpb blocks consume?
+      extra_blocks      = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) / 
+                           SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
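+
+      // (a round-up division: assuming SKC_TILE_RATIO divides
+      //  SKC_DEVICE_SUBBLOCKS_PER_BLOCK, this is simply
+      //  ceil(pk * SKC_TILE_RATIO / SKC_DEVICE_SUBBLOCKS_PER_BLOCK))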
+
+      // total keys
+      meta.out.keys    += meta.in.pk;
+
+      // how many blocks do we need to store the keys in the head and trailing nodes?
+      skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
+                           (SKC_RASTER_NODE_DWORDS - 1));
+      // increment blocks
+      extra_blocks     += hn;
+
+      // how many nodes trail the head?
+      meta.out.nodes    = hn - 1;
+      
+      // update blocks
+      meta.out.blocks  += extra_blocks;
+
+#if 0
+      printf("%3u - %5u, %5u, %5u, %5u\n",
+             gid,
+             meta.out.blocks,
+             meta.out.offset,
+             meta.out.nodes,
+             meta.out.keys);
+#endif
+    }
+
+  //
+  // allocate blocks from block pool
+  //
+  // first perform a prefix sum on the subgroup to reduce atomic
+  // operation traffic
+  //
+  // note this idiom can be implemented with vectors, subgroups or
+  // workgroups
+  //
+  
+  skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
+  skc_uint       reads  = 0;
+
+  // last lane performs the block pool allocation with an atomic increment
+  if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
+    reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
+  }
+
+  // broadcast block pool base to all lanes
+  reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
+
+  // update base for each lane
+  reads += prefix - extra_blocks;
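+
+  //
+  // Worked example (hypothetical 4-lane group): extra_blocks of
+  // { 2, 0, 3, 1 } gives an inclusive prefix of { 2, 2, 5, 6 }; the
+  // last lane bumps the ring's read cursor by 6 and, if the atomic
+  // returned 100, the per-lane bases come out as { 100, 102, 102, 105 }
+  // -- one contiguous span claimed with a single atomic
+  //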
+
+  //
+  // store meta header
+  //
+  if (is_active)
+    {
+      // store headers back to meta extent
+      vstore4(meta.out.u32v4,gid,metas);
+
+      // store reads
+      metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads; 
+
+      // get block_id of each raster head 
+      skc_block_id_t const block_id = bp_ids[reads & bp_mask];
+
+      // update map
+      map[raster_id] = block_id;
+
+#if 0
+      printf("alloc: %u / %u\n",raster_id,block_id);
+#endif
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl
index 27411cf..b0eb7ea 100644
--- a/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/rasters_reclaim.cl
@@ -1,442 +1,442 @@
-/*

- * Copyright 2017 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "block.h"

-#include "raster.h"

-#include "common.h"

-#include "atomic_cl.h"

-#include "block_pool_cl.h"

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)

-

-#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS     (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)

-

-#define SKC_RASTERS_RECLAIM_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)

-

-//

-//

-//

-

-#if   ( SKC_RASTERS_RECLAIM_X == 1 )

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_1()

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  0

-

-#elif ( SKC_RASTERS_RECLAIM_X == 2 )

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_2()

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  1

-

-#elif ( SKC_RASTERS_RECLAIM_X == 4 )

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_4()

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  3

-

-#elif ( SKC_RASTERS_RECLAIM_X == 8 )

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_8()

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  7

-

-#elif ( SKC_RASTERS_RECLAIM_X == 16)

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_16()

-#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  15

-

-#else

-#error "MISSING SKC_RASTERS_RECLAIM_X"

-#endif

-

-#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )

-

-#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (L)

-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-

-#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1

-

-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK      (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)

-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \

-                                                      (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))

-

-#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (L)

-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-

-#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1

-

-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO           (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)

-#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK      (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

-

-#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))

-#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)

-

-#endif

-

-//

-// FIXME -- slate these for replacement

-//

-

-#define SKC_BROADCAST(E,S,I)                                            \

-  sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_BROADCAST_LAST_HELPER(E,I)                          \

-  sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)

-

-#define SKC_BROADCAST_LAST(E,I)                 \

-  SKC_BROADCAST_LAST_HELPER(E,I)

-

-//

-// COMPILE-TIME PREDICATES

-//

-

-#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I)                       \

-  SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I)                          \

-  (skc_bool)SKC_GTE_MACRO(X, I   * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \

-  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)

-

-#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)          \

-  SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)

-

-#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)                 \

-  SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)

-

-//

-// RUN-TIME PREDICATES

-//

-

-#define SKC_RASTERS_RECLAIM_IS_HEADER(I)                                \

-  (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)

-

-//

-// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL

-// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK

-// COMBOS (NOT NECESSARILY POW2)

-//

-// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR

-// UINT TYPE INSTEAD OF A ULONG.

-//

-

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS     SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE  skc_uint

-

-//

-//

-//

-

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK  SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)

-

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I)          \

-  (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK)                  \

-   ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))

-

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C)        \

-  S = sub_group_scan_exclusive_add(C)

-

-#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I)                       \

-  (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)

-

-//

-//

-//

-

-struct skc_reclaim

-{

-  skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];

-};

-

-__kernel

-SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS

-void

-skc_kernel_rasters_reclaim(__global skc_block_id_t          * const bp_ids,      // block pool ids ring

-                           __global skc_uint                * const bp_elems,    // block pool blocks

-                           __global skc_uint       volatile * const bp_atomics,  // read/write atomics

-                           skc_uint                           const bp_mask,     // pow2 modulo mask for block pool ring

-                           __global skc_block_id_t const    * const map,         // raster host-to-device map

-                           struct   skc_reclaim               const reclaim)     // array of host raster ids

-{

-#if (__OPENCL_VERSION__ < 200)

-  skc_uint const reclaim_stride = get_num_sub_groups();

-#else

-  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups

-#endif

-  skc_uint       reclaim_idx    = get_group_id(0) * reclaim_stride + get_sub_group_id();

-

-#if 0

-  //

-  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT

-  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL

-  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE

-  // RECLAMATION JOB ON THE REST OF THE PIPELINE.

-  //

-  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)

-#endif

-    {

-      // get host raster id

-      skc_raster_h const raster = reclaim.aN[reclaim_idx];

-

-      // get block id of raster header

-      skc_block_id_t     id     = map[raster];

-

-      //

-      // load all of the head block ttxk.lo keys into registers

-      //

-      // FIXME -- this pattern lends itself to using the higher

-      // performance Intel GEN block load instructions

-      //

-      skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-      skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];

-

-      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-      //

-      // pick out count.nodes and count.prims from the header

-      //

-      // load raster header counts -- we only need the blocks and

-      // nodes words the keys are doublewords.

-      //

-      // FIXME -- this can be made portable with compile-time macro expansion

-      //

-      skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES

-      skc_uint count_nodes  = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

-

-#if 0

-      if (get_sub_group_local_id() == 0) {

-        printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);

-      }

-#endif

-      //

-      // acquire a span in the block pool ids ring for reclaimed ids

-      //

-      skc_uint bp_ids_base = 0;

-

-      if (get_sub_group_local_id() == 0) {

-        bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);

-      }

-

-      bp_ids_base = sub_group_broadcast(bp_ids_base,0);

-

-      //

-      // mask off everything but the block id

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \

-        h##I = h##I & SKC_TTXK_LO_MASK_ID;              \

-      }

-

-      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-      //

-      // swap current id with next

-      //

-      if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)

-        {

-          skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);

-

-          SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

-

-          id = next;

-#if 0

-          printf("rasters next = %u\n",id);

-#endif

-        }

-

-#if 0

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                 \

-        printf("%08X %u\n",h##I,h##I);

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-#endif

-      

-#if 0

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \

-        printf("%08X\n",h##I);                          \

-      }

-

-      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-#endif

-

-      //

-      // - we'll skip subgroups that are entirely header

-      //

-      // - but we need to mark any header elements that partially fill

-      //   a subgroup as subblocks

-      //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                         \

-      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \

-        if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) {  \

-          if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) {       \

-            h##I = SKC_UINT_MAX;                        \

-          }                                             \

-        }                                               \

-      }

-

-      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-      {

-        //

-        // count reclaimable blocks in each lane

-        //

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {                  \

-          packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \

-        }

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // scan to find index of each block

-        //

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

-

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

-

-        //

-        // store blocks back to ring

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {                  \

-          skc_uint const index      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \

-          skc_uint const count      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \

-          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \

-          if (count > 0) {                                              \

-            bp_ids[bp_ids_idx] = h##I;                                  \

-          }                                                             \

-          skc_uint const total = index + count;                         \

-          bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \

-        }

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-      }

-

-      // printf("R %7u ! %u\n",bp_ids_idx,h##I);

-            

-      //

-      // we're done if it was just the header

-      //

-      if (count_nodes == 0)

-        return;

-

-      //

-      // otherwise, walk the nodes

-      //

-      do {

-        // id of next block is in last lane

-        id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);

-

-        //

-        // load all of the node block ttxk.lo keys into registers

-        //

-        // FIXME -- this pattern lends itself to using the higher

-        // performance Intel GEN block load instructions

-        //

-        skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // mask off everything but the block id

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                 \

-        n##I = n##I & SKC_TTXK_LO_MASK_ID;

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // swap current id with next

-        //

-        if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)

-          {

-            skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);

-

-            SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

-

-            id = next;

-#if 0

-            printf("rasters next = %u\n",id);            

-#endif

-          }

-

-#if 0

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                 \

-        printf("%08X %u\n",n##I,n##I);

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-#endif

-

-        //

-        // count reclaimable blocks in each lane

-        //

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R)                                         \

-        packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-        //

-        // scan to find index of each block

-        //

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

-

-        SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

-

-        //

-        // store blocks back to ring

-        //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,R) {                                       \

-          skc_uint const index      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \

-          skc_uint const count      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \

-          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \

-          if (count > 0) {                                              \

-            bp_ids[bp_ids_idx] = n##I;                                  \

-          }                                                             \

-          skc_uint const total = index + count;                         \

-          bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \

-        }

-

-        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();

-

-        // printf("R %7u ! %u\n",bp_ids_idx,n##I);

-        

-        // any more nodes?

-      } while (--count_nodes > 0);

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "raster.h"
+#include "common.h"
+#include "atomic_cl.h"
+#include "block_pool_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS     (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
+
+#define SKC_RASTERS_RECLAIM_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
+
+//
+//
+//
+
+#if   ( SKC_RASTERS_RECLAIM_X == 1 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_1()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  0
+
+#elif ( SKC_RASTERS_RECLAIM_X == 2 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_2()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  1
+
+#elif ( SKC_RASTERS_RECLAIM_X == 4 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_4()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  3
+
+#elif ( SKC_RASTERS_RECLAIM_X == 8 )
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_8()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  7
+
+#elif ( SKC_RASTERS_RECLAIM_X == 16)
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_16()
+#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST  15
+
+#else
+#error "MISSING SKC_RASTERS_RECLAIM_X"
+#endif
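+
+//
+// SKC_RASTERS_RECLAIM_X is how many expansion steps a subgroup needs
+// to stride across one SKC_DEVICE_BLOCK_DWORDS block; the matching
+// SKC_EXPAND_<X>() macro unrolls the per-step statements in the
+// reclamation kernel below.
+//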
+
+#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (L)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK      (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
+                                                      (I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (L)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
+
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO           (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK      (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
+
+#define SKC_RASTERS_RECLAIM_STRIDE_H(L)              (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
+#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)           (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I)           (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
+
+#endif
+
+//
+// FIXME -- slate these for replacement
+//
+
+#define SKC_BROADCAST(E,S,I)                                            \
+  sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_BROADCAST_LAST_HELPER(E,I)                          \
+  sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+
+#define SKC_BROADCAST_LAST(E,I)                 \
+  SKC_BROADCAST_LAST_HELPER(E,I)
+
+//
+// COMPILE-TIME PREDICATES
+//
+
+#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I)                       \
+  SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I)                          \
+  (skc_bool)SKC_GTE_MACRO(X, I   * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
+  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
+
+#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)          \
+  SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
+
+#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)                 \
+  SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
+
+//
+// RUN-TIME PREDICATES
+//
+
+#define SKC_RASTERS_RECLAIM_IS_HEADER(I)                                \
+  (get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
+
+//
+// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
+// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
+// COMBOS (NOT NECESSARILY POW2)
+//
+// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
+// UINT TYPE INSTEAD OF A ULONG.
+//
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS     SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE  skc_uint
+
+//
+//
+//
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK  SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I)          \
+  (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK)                  \
+   ? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C)        \
+  S = sub_group_scan_exclusive_add(C)
+
+#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I)                       \
+  (((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
+
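
The three macros above implement a subgroup-wide compaction: each lane packs one small per-row count into its own bitfield, a single sub_group_scan_exclusive_add() ranks every (lane,row) slot at once, and the per-row indices are unpacked with shifts and masks. The sketch below shows the idiom in isolation. It is a minimal illustration assuming a cl_khr_subgroups device; the kernel name, ROWS, COUNT_BITS and the flags/index buffers are hypothetical, not SKC symbols.

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

// A field never overflows: per row, at most (subgroup size - 1) ones are
// summed into COUNT_BITS = log2(max subgroup size) bits, so no carry can
// spill into the next row's field.

#define ROWS        2                            // register "rows" held per lane
#define COUNT_BITS  4                            // log2 of the max subgroup size handled
#define COUNT_MASK  ((1u << COUNT_BITS) - 1)

__kernel void packed_scan_demo(__global uint const * const flags,  // 0/1 per (row,lane)
                               __global uint       * const index)  // compacted rank out
{
  uint const lane = get_sub_group_local_id();
  uint const size = get_max_sub_group_size();

  // pack one count per row into a single uint
  uint packed = 0;

  for (uint r = 0; r < ROWS; r++)
    packed |= flags[r * size + lane] << (COUNT_BITS * r);

  // one scan produces the exclusive prefix sum of every row simultaneously
  uint const scanned = sub_group_scan_exclusive_add(packed);

  // unpack the per-row ranks
  for (uint r = 0; r < ROWS; r++)
    index[r * size + lane] = (scanned >> (COUNT_BITS * r)) & COUNT_MASK;
}

The reclaim kernel below applies exactly this ranking, then offsets each row by a running base so the rows are stored to the ring one after another.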
+//
+//
+//
+
+struct skc_reclaim
+{
+  skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
+};
+
+__kernel
+SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
+void
+skc_kernel_rasters_reclaim(__global skc_block_id_t          * const bp_ids,      // block pool ids ring
+                           __global skc_uint                * const bp_elems,    // block pool blocks
+                           __global skc_uint       volatile * const bp_atomics,  // read/write atomics
+                           skc_uint                           const bp_mask,     // pow2 modulo mask for block pool ring
+                           __global skc_block_id_t const    * const map,         // raster host-to-device map
+                           struct   skc_reclaim               const reclaim)     // array of host raster ids
+{
+#if (__OPENCL_VERSION__ < 200)
+  skc_uint const reclaim_stride = get_num_sub_groups();
+#else
+  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
+#endif
+  skc_uint       reclaim_idx    = get_group_id(0) * reclaim_stride + get_sub_group_id();
+
+#if 0
+  //
+  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
+  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
+  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
+  // RECLAMATION JOB ON THE REST OF THE PIPELINE.
+  //
+  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
+#endif
+    {
+      // get host raster id
+      skc_raster_h const raster = reclaim.aN[reclaim_idx];
+
+      // get block id of raster header
+      skc_block_id_t     id     = map[raster];
+
+      //
+      // load all of the head block ttxk.lo keys into registers
+      //
+      // FIXME -- this pattern lends itself to using the higher
+      // performance Intel GEN block load instructions
+      //
+      skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+      skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
+
+      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+      //
+      // pick out count.nodes and count.prims from the header
+      //
+      // load raster header counts -- we only need the blocks and
+      // nodes words -- the keys are doublewords.
+      //
+      // FIXME -- this can be made portable with compile-time macro expansion
+      //
+      skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
+      skc_uint count_nodes  = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
+
+#if 0
+      if (get_sub_group_local_id() == 0) {
+        printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
+      }
+#endif
+      //
+      // acquire a span in the block pool ids ring for reclaimed ids
+      //
+      skc_uint bp_ids_base = 0;
+
+      if (get_sub_group_local_id() == 0) {
+        bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
+      }
+
+      bp_ids_base = sub_group_broadcast(bp_ids_base,0);
+
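
The pattern above -- one lane atomically bumps the ring's write cursor for the whole span, then broadcasts the base to its subgroup -- amortizes a single atomic over many stores. A minimal sketch follows, assuming a cl_khr_subgroups device and a power-of-two ring so wrapping is just a mask; the kernel name and the ring_writes/ring/span parameters are illustrative, not SKC symbols.

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

__kernel void ring_reserve_demo(__global volatile uint * const ring_writes, // write cursor
                                __global          uint * const ring,        // pow2-sized ring
                                uint                     const ring_mask,   // capacity - 1
                                uint                     const span)        // slots to publish
{
  uint base = 0;

  // one atomic per subgroup reserves the entire span...
  if (get_sub_group_local_id() == 0)
    base = atomic_add(ring_writes, span);

  // ...and every other lane picks up the same base
  base = sub_group_broadcast(base, 0);

  // power-of-two capacity makes the wrap a simple mask
  for (uint ii = get_sub_group_local_id(); ii < span; ii += get_max_sub_group_size())
    ring[(base + ii) & ring_mask] = ii;
}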
+      //
+      // mask off everything but the block id
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \
+        h##I = h##I & SKC_TTXK_LO_MASK_ID;              \
+      }
+
+      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+      //
+      // swap current id with next
+      //
+      if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+        {
+          skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+          SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+          id = next;
+#if 0
+          printf("rasters next = %u\n",id);
+#endif
+        }
+
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+        printf("%08X %u\n",h##I,h##I);
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+      
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \
+        printf("%08X\n",h##I);                          \
+      }
+
+      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+
+      //
+      // - we'll skip subgroups that are entirely header
+      //
+      // - but we need to mark any header elements that partially fill
+      //   a subgroup as subblocks
+      //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                         \
+      if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {    \
+        if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) {  \
+          if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) {       \
+            h##I = SKC_UINT_MAX;                        \
+          }                                             \
+        }                                               \
+      }
+
+      SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+      {
+        //
+        // count reclaimable blocks in each lane
+        //
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {                  \
+          packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
+        }
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // scan to find index of each block
+        //
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+        //
+        // store blocks back to ring
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) {                  \
+          skc_uint const index      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+          skc_uint const count      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
+          if (count > 0) {                                              \
+            bp_ids[bp_ids_idx] = h##I;                                  \
+          }                                                             \
+          skc_uint const total = index + count;                         \
+          bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
+        }
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+      }
+
+      // printf("R %7u ! %u\n",bp_ids_idx,h##I);
+            
+      //
+      // we're done if it was just the header
+      //
+      if (count_nodes == 0)
+        return;
+
+      //
+      // otherwise, walk the nodes
+      //
+      do {
+        // id of next block is in last lane
+        id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
+
+        //
+        // load all of the node block ttxk.lo keys into registers
+        //
+        // FIXME -- this pattern lends itself to using the higher
+        // performance Intel GEN block load instructions
+        //
+        skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // mask off everything but the block id
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+        n##I = n##I & SKC_TTXK_LO_MASK_ID;
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // swap current id with next
+        //
+        if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
+          {
+            skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
+
+            SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
+
+            id = next;
+#if 0
+            printf("rasters next = %u\n",id);            
+#endif
+          }
+
+#if 0
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                 \
+        printf("%08X %u\n",n##I,n##I);
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+#endif
+
+        //
+        // count reclaimable blocks in each lane
+        //
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R)                                         \
+        packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+        //
+        // scan to find index of each block
+        //
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
+
+        SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
+
+        //
+        // store blocks back to ring
+        //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
+          skc_uint const index      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
+          skc_uint const count      = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
+          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
+          if (count > 0) {                                              \
+            bp_ids[bp_ids_idx] = n##I;                                  \
+          }                                                             \
+          skc_uint const total = index + count;                         \
+          bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
+        }
+
+        SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
+
+        // printf("R %7u ! %u\n",bp_ids_idx,n##I);
+        
+        // any more nodes?
+      } while (--count_nodes > 0);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/render.cl b/src/compute/skc/platforms/cl_12/kernels/render.cl
index 9205334..a7b3229 100644
--- a/src/compute/skc/platforms/cl_12/kernels/render.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/render.cl
@@ -1,2165 +1,2165 @@
-/*

- * Copyright 2016 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-//

-//

-

-#include "tile.h"

-#include "block.h"

-#include "styling_types.h"

-#include "atomic_cl.h"

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define SKC_RENDER_SUBGROUP_MASK  (SKC_RENDER_SUBGROUP_SIZE - 1)

-

-//

-//

-//

-

-#if   ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_1()

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      0

-

-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_2()

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      1

-

-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_4()

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      3

-

-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_8()

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      7

-

-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16)

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_16()

-#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      15

-#endif

-

-//

-// tile state flag bits

-//

-

-typedef enum skc_tile_flags_e {

-

-  // FLUSH

-  SKC_TILE_FLAGS_FLUSH_FINALIZE    = 0x00000001,

-  SKC_TILE_FLAGS_FLUSH_UNWIND      = 0x00000002,

-  SKC_TILE_FLAGS_FLUSH_COMPLETE    = 0x00000004,

-

-  // OPACITY

-  SKC_TILE_FLAGS_SCATTER_SKIP      = 0x00000008,

-

-  //

-  // Note: testing for opacity and skipping scattering is on its way

-  // to becoming a much more programmable option because sometimes we

-  // may be compositing/blending from back-to-front and/or be using

-  // group blend rules that ignore opacity.

-  //

-  // The point is that all of these decisions should be encoded in

-  // styling commands and, as much as possible, removed from the final

-  // group/layer styling traversal render loop.

-  //

-

-} skc_tile_flags_e;

-

-//

-// COVER -- assumes availability of either fp16 or fp32

-//

-

-union skc_tile_cover

-{

-  struct {

-    SKC_RENDER_TILE_COVER             c[SKC_TILE_WIDTH];

-  } aN;

-

-#ifdef SKC_RENDER_TILE_COVER_VECTOR

-  struct {

-    SKC_RENDER_TILE_COVER_VECTOR      c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];

-  } vN;

-#endif

-};

-

-//

-// COLOR -- assumes availability of either fp16 or fp32

-//

-

-union skc_tile_color

-{

-  union {

-    struct {

-      SKC_RENDER_TILE_COLOR           r;

-      SKC_RENDER_TILE_COLOR           g;

-      SKC_RENDER_TILE_COLOR           b;

-      SKC_RENDER_TILE_COLOR           a;

-    } rgba[SKC_TILE_WIDTH];

-  } aN;

-

-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED

-  union {

-    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];

-  } iN;

-#endif

-

-#ifdef SKC_RENDER_TILE_COLOR_VECTOR

-  union {

-    SKC_RENDER_TILE_COLOR_VECTOR      rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];

-  } vN;

-#endif

-

-  struct {

-    union {

-      struct {

-        SKC_RENDER_TILE_COLOR         r;

-        SKC_RENDER_TILE_COLOR         g;

-      };

-      SKC_RENDER_GRADIENT_FLOAT       distance;

-    };

-    union {

-      struct {

-        SKC_RENDER_TILE_COLOR         b;

-        SKC_RENDER_TILE_COLOR         a;

-      };

-      SKC_RENDER_GRADIENT_FLOAT       stoplerp;

-    };

-  } grad[SKC_TILE_WIDTH];

-};

-

-//

-// SHARED MEMORY STATE

-//

-

-#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)

-

-#define SKC_RENDER_WIDE_AA_BYTES   (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)

-#define SKC_RENDER_WIDE_AA_WIDTH   (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))

-

-//

-//

-//

-

-union skc_subgroup_smem

-{

-  //

-  // The tiles are stored in column-major / height-major order

-  //

-  // The final column is a guard column that is OK to write to but

-  // will never be read.  It simplifies the TTSB scatter but could be

-  // predicated if SMEM is really at a premium.

-  //

-#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )

-  struct {

-    SKC_ATOMIC_UINT              area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]

-  } atomic;

-#endif

-

-  struct {

-    int                          area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]

-  } aN;

-

-  struct { // assumption is that height = subgroup

-    SKC_RENDER_AREA_V            area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];

-  } vN;

-

-  struct { // assumption is that height = subgroup

-    SKC_RENDER_WIDE_AA           area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];

-  } wide;

-

-  union skc_styling_cmd          cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];

-

-  half                           gc  [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];

-

-#if 0

-  //

-  // SPILL TO GMEM

-  //

-#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)

-  struct {

-

-#if (SKC_REGS_COLOR_S > 0)

-    union skc_color_r            color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];

-#endif

-

-#if (SKC_REGS_COVER_S > 0)

-    union float                  cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];

-#endif

-

-  } regs;

-#endif

-  //

-  //

-  //

-#endif

-};

-

-//

-//

-//

-

-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )

-

-#define skc_subgroup_lane()  0

-

-#else

-

-#define skc_subgroup_lane()  get_sub_group_local_id()

-

-#endif

-

-//

-//

-//

-

-typedef skc_uint  skc_ttsk_lo_t;

-typedef skc_uint  skc_ttsk_hi_t;

-

-typedef skc_uint  skc_ttpk_lo_t;

-typedef skc_uint  skc_ttpk_hi_t;

-

-typedef skc_uint  skc_ttxk_lo_t;

-typedef skc_uint  skc_ttxk_hi_t;

-

-typedef skc_uint  skc_ttck_lo_t;

-typedef skc_uint  skc_ttck_hi_t;

-

-typedef skc_uint2 skc_ttck_t;

-

-typedef skc_int   skc_ttxb_t;

-

-//

-// TTCK (32-BIT COMPARE) v1:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          30          |    1   |    1   |   18  |  7  |  7  |

-//

-//

-// TTCK (32-BIT COMPARE) v2:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          30          |    1   |    1   |   15  |  9  |  8  |

-//

-//

-// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:

-//

-//  0                                                           63

-//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |

-//  +----------------------+--------+--------+-------+-----+-----+

-//  |          27          |    1   |    1   |   18  |  9  |  8  |

-//

-
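
Because the layer field straddles the 32-bit boundary of the 64-bit TTCK key, extracting it takes a shift from the low word plus a masked shift from the high word, which is what skc_ttck_get_layer() below does. A standalone sketch follows, using the 64-bit-compare layout from the table above (27 id bits + prefix + escape leave 3 layer bits in the low word and 15 at the bottom of the high word); the DEMO_* names and the uint2 packing are illustrative only.

#define DEMO_LO_BITS_ID_PREFIX_ESCAPE  29u
#define DEMO_LO_BITS_LAYER             3u
#define DEMO_HI_MASK_LAYER             ((1u << 15) - 1)

static uint demo_ttck_get_layer(uint2 const k)   // k.x = lo word, k.y = hi word
{
  uint const lo = k.x >> DEMO_LO_BITS_ID_PREFIX_ESCAPE;              // layer[2:0]
  uint const hi = (k.y & DEMO_HI_MASK_LAYER) << DEMO_LO_BITS_LAYER;  // layer[17:3]

  return lo | hi;
}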

-static

-skc_uint

-skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)

-{

-  return a & SKC_TTCK_LO_MASK_ID;

-}

-

-static

-skc_layer_id

-skc_ttck_get_layer(skc_ttck_t const a)

-{

-  //

-  // FIXME -- a union with a ulong and a shift down and mask is

-  // probably faster on some architectures

-  //

-  skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);

-  skc_uint const hi = (a.hi  & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;

-

-  return lo | hi;

-}

-

-static

-skc_uint

-skc_ttck_hi_get_x(skc_ttck_hi_t const a)

-{

-  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);

-}

-

-static

-skc_uint

-skc_ttck_hi_get_y(skc_ttck_hi_t const a)

-{

-  return a >> SKC_TTCK_HI_OFFSET_Y;

-}

-

-static

-skc_bool

-skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)

-{

-  skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);

-  skc_uint const hi = (a.hi ^ b.hi);

-

-  return (lo | hi) == 0;

-}

-

-static

-skc_bool

-skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)

-{

-  return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;

-}

-

-static

-skc_bool

-skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)

-{

-  return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;

-}

-

-//

-// TILE TRACE SUBPIXEL

-//

-// The subpixels are encoded with either absolute tile coordinates

-// (32-bits) or packed in delta-encoded form.

-//

-// For 32-bit subpixel packing of a 32x32 tile:

-//

-// A tile X is encoded as:

-//

-//   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.

-//

-//   SX :  6 : unsigned subpixel span from min to max x with range

-//             [0,32]. The original direction is not captured. Would

-//             be nice to capture dx; it's not necessary right now but

-//             could be in the future. <--- SPARE VALUES AVAILABLE

-//

-// A tile Y is encoded as:

-//

-//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.

-//

-//   DY :  6 : signed subpixel delta y1-y0. The range of delta is

-//             [-32,32] but horizontal lines are not encoded so [1,32]

-//             is mapped to [0,31]. The resulting range [-32,31] fits

-//             in 6 bits.

-//

-// TTS:

-//

-//  0                        31

-//  |  TX |  SX  |  TY |  DY  |

-//  +-----+------+-----+------+

-//  |  10 |   6  |  10 |   6  |

-//

-
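
Decoding a TTS value is plain shift-and-mask work plus one trick for DY: because dy == 0 never occurs, non-negative stored values are bumped by one on read (the "- (~a >> 31)" in skc_tts_get_dy_v() below adds 1 exactly when the sign bit is clear, relying on arithmetic right shifts of signed ints). A scalar sketch with the 10/6/10/6 widths from the table; the demo_* names and plain int types are assumptions for illustration, since the kernels use vectorized bitfield types.

static int demo_tts_tx(int const a) { return  a        & 0x3ff; }   // bits  9:0
static int demo_tts_sx(int const a) { return (a >> 10) & 0x03f; }   // bits 15:10
static int demo_tts_ty(int const a) { return (a >> 16) & 0x3ff; }   // bits 25:16

static int demo_tts_dy(int const a)
{
  int const dy = a >> 26;          // arithmetic shift sign-extends DY
  return dy - (~a >> 31);          // adds 1 when DY >= 0: [0,31] -> [1,32]
}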

-static

-SKC_RENDER_TTS_V_BITFIELD

-skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)

-{

-  //

-  // extract the whole pixel y coordinate

-  //

-  return SKC_BFE(a,

-                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,

-                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);

-}

-

-static

-SKC_RENDER_TTS_V_BITFIELD

-skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)

-{

-  //

-  // get the linear array tile index of the pixel

-  //

-  return (((a & SKC_TTS_MASK_TX_PIXEL)

-

-#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)

-           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)

-#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)

-           << (SKC_TILE_HEIGHT_LOG2     - SKC_SUBPIXEL_RESL_X_LOG2)

-#endif

-

-           ) | skc_tts_get_ty_pixel_v(a));

-}

-

-#if 0

-static

-skc_ttx_v_s32_t

-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)

-{

-  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;

-

-  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));

-}

-#else

-static

-SKC_RENDER_TTS_V_BITFIELD

-skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)

-{

-  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;

-

-  return dy - (~a >> 31);

-}

-#endif

-

-static

-SKC_RENDER_TTS_V_BITFIELD

-skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)

-{

-  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);

-}

-

-static

-SKC_RENDER_TTS_V_BITFIELD

-skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)

-{

-  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)

-{

-  //

-  // SIMD / CPU

-  //

-  //      &

-  //

-  // SIMT / GPU

-  //

-  // Note that atomic_init() is likely implemented as a simple

-  // assignment so there is no identifiable performance difference on

-  // current targets.

-  //

-  // If such an architecture appears in the future then we'll probably

-  // still want to implement this zero'ing operation as below but

-  // follow with an appropriate fence that occurs before any scatter

-  // operations.

-  //

-  // The baroque expansion below improves performance on Intel GEN by,

-  // presumably, achieving the 64-byte per clock SLM write as well as

-  // minimizing the overall number of SEND() block initializations and

-  // launches.

-  //

-  // Intel GENx has a documented 64 byte per cycle SLM write limit.

-  // So having each lane in an 8 lane subgroup zero-write 8 bytes is

-  // probably a safe bet (Later: benchmarking backs this up!).

-  //

-  // Note there is no reason at this time to unroll this loop.

-  //

-  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)

-    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );

-}

-

-//

-// Note this is going to be vectorizable on most architectures.

-//

-// The return of the key translation feature might complicate things.

-//

-

-static

-void

-skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,

-                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,

-                 skc_block_id_t                                  const pb_id)

-{

-  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

-

-#if   ( SKC_TILE_RATIO == 1 )

-

-  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];

-

-#elif ( SKC_TILE_RATIO == 2 )

-

-  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);

-

-#else

-

-#error("tile ratio greater than 2 not supported")

-

-#endif

-

-  //

-  // Note there is no need to use an atomic for this operation on the

-  // current group of target platforms... but this may change if

-  // atomic ops truly go through a different path.

-  //

-  // As noted above, this direct increment is probably faster and can

-  // always be followed by a fence.

-  //

-  // Furthermore, note that the key sorting orders all ttck keys

-  // before ttpk keys.

-  //

-

-  //

-  // FIXME -- if the SMEM store is wider than bank word count then we

-  // might want to odd-even interleave the TTP values if the target

-  // device can't handle 64-bit stores

-  //

-

-  //

-  // skipping per-key translation for now

-  //

-  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);

-}

-

-//

-// Note that skc_scatter_ttsb is *not* vectorizable unless the

-// architecture supports a "scatter-add" capability.  All relevant

-// GPUs support atomic add on shared/local memory and thus support

-// scatter-add.

-//

-

-static

-void

-skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,

-                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,

-                 skc_block_id_t                                  const sb_id)

-{

-  skc_uint         const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

-

-  SKC_RENDER_TTS_V const tts_v  = ttxb_extent[offset];

-

-  //

-  // Skipping per-key translation for now

-  //

-

-  // Index into tile

-  //

-  // The tiles are stored in column-major / height-major order

-  //

-  // The final column is a guard column that is OK to write to but

-  // will never be read.  It simplifies the TTSB scatter but could be

-  // predicated if SMEM is really at a premium.

-  //

-

-  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);

-

-#if 0

-  if (tts_v != SKC_TTS_INVALID)

-    printf("(%08X) = %u\n",tts_v,xy_idx);

-#endif

-

-  //

-  // adjust subpixel range to max y

-  //

-  // range is stored as [-32,31] and when read [0,31] is mapped to

-  // [1,32] because a dy of 0 is not possible.

-  //

-  // more succinctly: if dy >= 0 then ++dy

-  //

-  SKC_RENDER_TTS_V_BITFIELD const dy     = skc_tts_get_dy_v(tts_v);

-

-  //

-  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?

-  //

-

-  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"

-  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);

-

-  // Calculate left and right coverage contribution trapezoids

-  SKC_RENDER_TTS_V_BITFIELD const left   = dy * widths;

-  SKC_RENDER_TTS_V_BITFIELD const right  = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;

-

-  //

-  // Accumulate altitudes and areas

-  //

-  // Optimization: if the device supports an CPU/SIMD vector-add or

-  // GPU/SIMT scatter-add atomic int2 add operation then placing the

-  // ALT and AREA values side-by-side would halve the number of

-  // additions.

-  //

-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )

-  //

-  // CPU/SIMD

-  //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                 \

-  if (tts_v C != SKC_TTS_INVALID) {                             \

-    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \

-    smem->aN.area[                  xy_idx C] += right C;       \

-  }

-

-#else

-  //

-  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD

-  //

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A)                                         \

-  if (tts_v C != SKC_TTS_INVALID) {                                     \

-    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \

-                                          SKC_TILE_HEIGHT   + xy_idx C, \

-                                          left C);                      \

-    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \

-                                          right C);                     \

-  }

-#endif

-

-  SKC_RENDER_TTSB_EXPAND();

-}

-

-//

-// Note that 2048.0 can be represented exactly with fp16... fortuitous!

-//

-

-#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)

-#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)

-#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)

-#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,

-                       union skc_tile_cover            * SKC_RESTRICT const cover,

-                       union skc_tile_color            * SKC_RESTRICT const color)

-{

-  SKC_RENDER_ACC_COVER_INT area = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      area                                   += smem->vN.area[ii][skc_subgroup_lane()];

-      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);

-      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));

-

-      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);

-    }

-}

-

-static

-void

-skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,

-                       union skc_tile_cover            * SKC_RESTRICT const cover,

-                       union skc_tile_color            * SKC_RESTRICT const color)

-{

-  SKC_RENDER_ACC_COVER_INT area = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      area                                   += smem->vN.area[ii][skc_subgroup_lane()];

-      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);

-      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));

-

-      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;

-    }

-}

-
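
Both fill rules run over the same prefix-summed trapezoid areas; they differ only in how the running signed area maps to coverage. The nonzero rule saturates |area| at one full pixel, while the even-odd rule folds |area| back down every second winding. A scalar sketch using the MAX_AREA convention above (2 * 32 * 32 = 2048 area units per fully covered pixel); the DEMO_* names are illustrative only.

#define DEMO_MAX_AREA    2048               // one fully covered pixel, in area units
#define DEMO_MAX_AREA_2  (2 * DEMO_MAX_AREA)
#define DEMO_EO_MASK     (DEMO_MAX_AREA_2 - 1)

// nonzero: saturate the magnitude of the winding area at one pixel
static float demo_cover_nonzero(int const area)
{
  uint const trapabs = abs(area);
  return (float)min(trapabs, (uint)DEMO_MAX_AREA) * (1.0f / DEMO_MAX_AREA);
}

// even-odd: reflect the magnitude back down every 2 * MAX_AREA units
static float demo_cover_evenodd(int const area)
{
  uint const trapabs = abs(area);
  int  const folded  = (int)(trapabs & DEMO_EO_MASK) - DEMO_MAX_AREA;  // [-2048,2047]
  uint const reflect = abs(folded);
  return (float)(DEMO_MAX_AREA - reflect) * (1.0f / DEMO_MAX_AREA);
}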

-//

-//

-//

-

-static

-void

-skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,

-                          uint                                 * SKC_RESTRICT const cmd_next,

-                          union skc_tile_color                 * SKC_RESTRICT const color)

-{

-  //

-  // rgba = solid fill

-  //

-  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;

-

-  *cmd_next += 2;

-

-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )

-

-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].r = rg.lo;

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].g = rg.hi;

-

-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].b = ba.lo;

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].a = ba.hi;

-

-#else

-

-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

-  SKC_RENDER_TILE_COLOR      const r  = rg.lo;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);

-

-  SKC_RENDER_TILE_COLOR      const g  = rg.hi;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.even  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);

-

-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

-  SKC_RENDER_TILE_COLOR      const b  = ba.lo;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.odd  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);

-

-  SKC_RENDER_TILE_COLOR      const a  = ba.hi;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.odd   = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);

-

-#endif

-}

-

-//

-// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"

-//

-// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/

-//

-// Lerp in two fma/mad ops:

-//

-//    t * b + ((-t) * a + a)

-//

-// Note: OpenCL documents mix() as being implemented as:

-//

-//    a + (b - a) * t

-//

-// But this may be a native instruction on some devices.  For example,

-// on GEN9 there is an LRP "linear interoplation" function but it

-// doesn't appear to support half floats.

-//

-

-#if 1

-#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))

-#else

-#define SKC_LERP(a,b,t)  mix(a,b,t)

-#endif

-

-//

-// CPUs have a mock local address space so copying the gradient header

-// is probably not useful.  Just read directly from global.

-//

-

-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL

-#define SKC_RENDER_GRADIENT_SPACE  __local

-#else

-#define SKC_RENDER_GRADIENT_SPACE  __global

-#endif

-

-//

-// gradient is non-vertical

-//

-// removed the vertical (actually, horizontal) special case

-//

-

-static

-void

-skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem     * SKC_RESTRICT const smem,

-                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,

-                                                uint                                 * SKC_RESTRICT const cmd_next,

-                                                union skc_tile_color                 * SKC_RESTRICT const color,

-                                                skc_ttck_hi_t                                       const ttck_hi)

-{

-  //

-  // Where is this tile?

-  //

-  // Note that the gradient is being sampled from pixel centers.

-  //

-  SKC_RENDER_GRADIENT_FLOAT const y =

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P

-    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +

-    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));

-

-  float                     const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);

-

-  //

-  // Get starting numerator and denominator

-  //

-  // Note: if gh[0].dx is exactly 0.0f then this is a vertical

-  // gradient and can be handled by a special opcode.

-  //

-  // Note: the mad() ordering is slightly different than the original

-  // CUDA implementation.

-  //

-  union skc_gradient_vector const gv       = { vload4(0,&commands[*cmd_next].f32) };

-

-  *cmd_next += 4;

-

-  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);

-  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);

-

-  //

-  // Where are columns along gradient vector?

-  //

-  // TODO: Note that the gv_denom isn't multiplied through.

-  //

-  // Please doublecheck this... but I recall that in certain cases

-  // this wipes out some precision and results in minor but noticeable

-  // gradient artifacts.

-  //

-  // All arguments are scalars except gv_numer so a simpler

-  // evaluation might save some flops.

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;

-

-  //

-  // is gradient non-repeating, repeating or reflecting?

-  //

-  switch (commands[(*cmd_next)++].u32)

-    {

-    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:

-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);

-      break;

-

-    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:

-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-        color->grad[ii].distance -= floor(color->grad[ii].distance);

-      break;

-

-    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING

-      //

-      // OPTIMIZATION: Can this be done in fewer than ~4 ops?

-      //

-      // Note: OpenCL "rint()" is round-to-nearest-even integer!

-      //

-      // Note: the floor() "round to -inf" op is implemented in the

-      // GEN op 'FRC' so probably don't use trunc() when floor will

-      // suffice.

-      //

-

-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-        {

-          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);

-          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));

-        }

-    }

-
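
The switch above reduces to three scalar mappings of the gradient parameter d: clamp for a non-repeating gradient, fractional part for a repeating one, and a mirror-fold of |d| about its nearest integer for a reflecting one (fabs(x - rint(x)) yields a triangle wave with period 1 and peak 0.5). A scalar sketch; the demo_* names are illustrative only.

static float demo_grad_clamp(float const d)   { return clamp(d, 0.0f, 1.0f); }

static float demo_grad_repeat(float const d)  { return d - floor(d); }

static float demo_grad_reflect(float const d)
{
  // fold |d| back around the nearest integer (round-to-nearest-even)
  float const dist_abs = fabs(d);
  return fabs(dist_abs - rint(dist_abs));
}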

-  //

-  // initialize "stoplerp" for all columns

-  //

-  uint const slope_count = commands[(*cmd_next)++].u32;

-  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME

-

-  {

-    float const slope = commands[(*cmd_next)++].f32;

-

-    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-      color->grad[ii].stoplerp = color->grad[ii].distance * slope;

-  }

-

-  //

-  // compute stoplerp for remaining stops

-  //

-  for (int jj=1; jj<slope_count; jj++)

-    {

-      float const floor = (float)jj;

-      float const slope = commands[(*cmd_next)++].f32;

-

-      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);

-    }

-

-  //

-  // copy gradient colors to local memory

-  //

-  uint const gd_n = slope_count + 1;

-

-#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL

-  //

-  // copy entire gradient descriptor to local memory

-  //

-  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)

-    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;

-

-  __local  half const * const SKC_RESTRICT gc = smem->gc + 0;

-#else

-  //

-  // prefetch entire gradient header

-  //

-  // no noticeable impact on performance

-  //

-  // prefetch(&commands[*cmd_next].u32,gh_words);

-  //

-  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;

-#endif

-

-  //

-  // adjust cmd_next so that V1 structure is consumed -- FIXME

-  //

-  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);

-

-  //

-  // lerp between color pair stops

-  //

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      //

-      // Finally, we have the gradient stop index and the color stop

-      // pair lerp fraction

-      //

-      // Note that if these are vector values then a gather operation

-      // must occur -- there may be platforms (AVX-512?) that can

-      // perform an explicit gather on a vector type but it's not

-      // really expressible in OpenCL except implicitly with a

-      // workgroup of work items.

-      //

-      // ***********************

-      //

-      // FIXME -- USE HERB'S SINGLE FMA LERP

-      //

-      // ***********************

-      //

-      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);

-      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));

-

-      {

-        SKC_RENDER_TILE_COLOR lo, hi;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \

-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \

-          lo C                                = cc.lo;                  \

-          hi C                                = cc.hi;                  \

-        }

-

-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-

-        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);

-      }

-

-      //

-      //

-      //

-      {

-        SKC_RENDER_TILE_COLOR lo, hi;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \

-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \

-          lo C                                = cc.lo;                  \

-          hi C                                = cc.hi;                  \

-        }

-

-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-

-        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);

-      }

-

-      //

-      //

-      //

-      {

-        SKC_RENDER_TILE_COLOR lo, hi;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \

-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \

-          lo C                                = cc.lo;                  \

-          hi C                                = cc.hi;                  \

-        }

-

-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-

-        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);

-      }

-

-      //

-      //

-      //

-      {

-        SKC_RENDER_TILE_COLOR lo, hi;

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                                       \

-          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \

-          lo C                                = cc.lo;                  \

-          hi C                                = cc.hi;                  \

-        }

-

-        SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-

-        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);

-      }

-    }

-}

-
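
For each column, the integer part of stoplerp selects a color-stop pair and the fractional part is the lerp parameter; the r, g, b and a planes sit gd_n pair-entries apart, which is why the loads above step by gd_n. The sketch below shows that lookup for one channel with a flat float layout; demo_lerp(), demo_gradient_channel() and the { lo0, hi0, lo1, hi1, ... } layout are assumptions for illustration, not the half2 pair layout the real code loads.

static float demo_lerp(float const a, float const b, float const t)
{
  // two-FMA lerp from the note above: t*b + (-t*a + a)
  return mad(t, b, mad(-t, a, a));
}

// One channel plane: stops[] holds a (lo,hi) value pair per gradient stop,
// flattened as { lo0, hi0, lo1, hi1, ... }. Illustrative layout only.
static float demo_gradient_channel(__constant float const * const stops,
                                   float                    const stoplerp)
{
  uint  const stop = (uint)stoplerp;              // integer part picks the pair
  float const frac = stoplerp - floor(stoplerp);  // fraction lerps inside it

  return demo_lerp(stops[2 * stop], stops[2 * stop + 1], frac);
}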

-//

-//

-//

-

-static

-void

-skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,

-                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,

-                    union skc_tile_color const * SKC_RESTRICT const color_wip)

-{

-  //

-  // fralunco = cover.wip * acc.a

-  //

-  // acc.r    =  fralunco * wip.r + acc.r

-  // acc.g    =  fralunco * wip.g + acc.g

-  // acc.b    =  fralunco * wip.b + acc.b

-  // acc.a    = -fralunco * wip.a + acc.a

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;

-

-      color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);

-      color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);

-      color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);

-      color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);

-    }

-}

-
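
skc_tile_blend_over() above is a front-to-back OVER in which the accumulator's alpha channel stores remaining transparency: it starts at 1 and is driven toward 0, so each layer contributes cover * acc.a of its color and shrinks acc.a by that amount times its own alpha. A single-pixel sketch; the struct and function names are illustrative only.

typedef struct { float r, g, b, a; } demo_rgba_t;   // a = remaining transparency

static void demo_blend_over(demo_rgba_t * const acc, demo_rgba_t const wip, float const cover)
{
  // how much of this layer shows through everything already composited
  float const contrib = cover * acc->a;

  acc->r = mad(+contrib, wip.r, acc->r);
  acc->g = mad(+contrib, wip.g, acc->g);
  acc->b = mad(+contrib, wip.b, acc->b);
  acc->a = mad(-contrib, wip.a, acc->a);   // remaining transparency shrinks by contrib * wip.a
}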

-//

-//

-//

-

-static

-void

-skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,

-                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,

-                    union skc_tile_color const * SKC_RESTRICT const color_wip)

-{

-  //

-  // cover_min = min(cover.wip,a.acc)

-  //

-  // r.acc =  cover_min * r.wip + r.acc

-  // g.acc =  cover_min * g.wip + g.acc

-  // b.acc =  cover_min * b.wip + b.acc

-  // a.acc = -cover_min * a.wip + a.acc

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);

-

-      color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);

-      color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);

-      color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);

-      color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);

-    }

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,

-                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,

-                        union skc_tile_color const * SKC_RESTRICT const color_wip)

-{

-  //

-  // r.acc = (cover.wip * r.wip) * r.acc

-  // g.acc = (cover.wip * g.wip) * g.acc

-  // b.acc = (cover.wip * b.wip) * b.acc

-  // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;

-      color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;

-      color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;

-      color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;

-    }

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,

-                        union skc_tile_color       * SKC_RESTRICT const color_acc,

-                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,

-                        union skc_tile_color const * SKC_RESTRICT const color_wip)

-{

-  //

-  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip

-  // cover.acc         = cover.acc + cover.wip.contrib

-  //

-  // r.acc =  cover.wip.contrib * r.wip + r.acc

-  // g.acc =  cover.wip.contrib * g.wip + g.acc

-  // b.acc =  cover.wip.contrib * b.wip + b.acc

-  // a.acc = -cover.wip.contrib * a.wip * a.acc

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];

-

-      cover_acc->aN.c[ii]     += contrib;

-

-      color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);

-      color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);

-      color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);

-      color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);

-    }

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,

-                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];

-

-#else

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,

-                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];

-

-#else

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,

-                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)

-{

-  //

-  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip

-  // cover.acc         = cover.acc + cover.wip.contrib

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,

-                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)

-{

-  //

-  // cover.wip *= cover.msk

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover->aN.c[ii] = 0;

-

-#else

-  //

-  // GEN9 compiler underperforms on this

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover->vN.c[ii] = 0;

-

-#endif

-}

-

-static

-void

-skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover->aN.c[ii] = 0;

-

-#else

-  //

-  // GEN9 compiler underperforms on this

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover->vN.c[ii] = 0;

-

-#endif

-}

-

-static

-void

-skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover->aN.c[ii] = 0;

-

-#else

-  //

-  // GEN9 compiler underperforms on this

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover->vN.c[ii] = 0;

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover->aN.c[ii] = 1;

-

-#else

-  //

-  // GEN9 compiler underperforms on this

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)

-{

-#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    cover->aN.c[ii] = 1 - cover->aN.c[ii];

-

-#else

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)

-    cover->vN.c[ii] = 1 - cover->vN.c[ii];

-

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)

-{

-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      color->aN.rgba[ii].r = 0;

-      color->aN.rgba[ii].g = 0;

-      color->aN.rgba[ii].b = 0;

-      color->aN.rgba[ii].a = 1;

-    }

-

-#else

-  //

-  // DISABLED ON GEN9 -- probably a compiler bug

-  //

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.even = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.even  = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.odd  = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.odd   = 1;

-#endif

-}

-

-static

-void

-skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)

-{

-#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      color->aN.rgba[ii].r = 0;

-      color->aN.rgba[ii].g = 0;

-      color->aN.rgba[ii].b = 0;

-      color->aN.rgba[ii].a = 1;

-    }

-

-#else

-  //

-  // DISABLED ON GEN9 -- probably a compiler bug

-  //

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.even = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.even  = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].even.odd  = 0;

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    color->vN.rgba[ii].odd.odd   = 1;

-#endif

-}

-

-//

-//

-//

-

-static

-bool

-skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)

-{

-  //

-  // returns true if tile is opaque

-  //

-  // various hacks to test for complete tile opacity

-  //

-  // note that front-to-back currently has alpha at 0.0f -- this can

-  // be harmonized to use a traditional alpha if we want to support

-  // rendering in either direction

-  //

-  // hack -- ADD/MAX/OR all alphas together and test for non-zero

-  //

-  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))

-  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)

-    t += color->aN.rgba[ii].a;

-

-#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )

-  //

-  // SIMD

-  //

-  return !any(t != ( 0 ));

-

-#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )

-  //

-  // SIMT - scalar per lane

-  //

-  return !sub_group_any(t != 0);

-

-#else

-  //

-  // SIMT - vector per lane

-  //

-  return !sub_group_any(any(t != ( 0 )));

-

-#endif

-

-  //

-  // TODO: The alternative vector-per-lane implementation below is

-  // *not* believed to be performant because the terse vector-wide

-  // test is just hiding a series of comparisons and is likely worse

-  // than the blind ADD/MAX/OR'ing of all alphas followed by a single

-  // test.

-  //

-#if 0

-  //

-  // SIMT - vector per lane

-  //

-

-  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))

-  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)

-    {

-      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))

-        return false;

-    }

-

-  return true;

-#endif

-}

-

-//

-//

-//

-

-static

-void

-skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,

-                         uint                                 * SKC_RESTRICT const cmd_next,

-                         union skc_tile_color                 * SKC_RESTRICT const color)

-{

-  //

-  // acc.r = acc.a * r + acc.r

-  // acc.g = acc.a * g + acc.g

-  // acc.b = acc.a * b + acc.b

-  //

-  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;

-

-  *cmd_next += 2;

-

-  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);

-

-  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);

-}

-

-//

-//

-//

-

-// #define SKC_SURFACE_IS_BUFFER

-#ifdef  SKC_SURFACE_IS_BUFFER

-

-static

-void

-skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,

-                              skc_uint                                           const surface_pitch,

-                              union skc_tile_color          const * SKC_RESTRICT const color,

-                              skc_ttck_hi_t                                      const ttck_hi)

-{

-  //

-  // NEW MAJOR OPTIMIZATION:

-  //

-  // Rotating and rasterizing the original world transform by -90

-  // degrees and then rendering the scene by +90 degrees enables

-  // all of the final surface compositing to be performed in perfectly

-  // coalesced wide transactions.

-  //

-  // For this reason, linear access to the framebuffer is preferred.

-  //

-  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv

-  //

-  // NOTE THIS IS TRANSPOSED BY 90 DEGREES

-  //

-  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE

-  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.

-  //

-  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS

-  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS

-  //

-  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL

-  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER

-  //

-  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;

-  uint const x     = skc_ttck_hi_get_x(ttck_hi);

-  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;

-  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );

-

-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);

-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;

-      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;

-

-      surface[base + ii * pitch] = rgba;

-

-      // printf("%08v2X\n",rgba);

-    }

-}

-

-#else

-

-static

-void

-skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,

-                              union skc_tile_color const * SKC_RESTRICT const color,

-                              skc_ttck_hi_t                                   const ttck_hi)

-{

-  //

-  // NEW MAJOR OPTIMIZATION:

-  //

-  // Rotating and rasterizing the original world transform by -90

-  // degrees and then rendering the scene scene by +90 degrees enables

-  // all of the final surface compositing to be performed in perfectly

-  // coalesced wide transactions.

-  //

-  // For this reason, linear access to the framebuffer is preferred.

-  //

-  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv

-  //

-  // NOTE THIS IS TRANSPOSED BY 90 DEGREES

-  //

-  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE

-  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.

-  //

-  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS

-  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS

-  //

-  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL

-  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER

-  //

-

-#if 1

-  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;

-  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);

-

-  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-    {

-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                       \

-        SKC_RENDER_SURFACE_WRITE(surface,               \

-                                 (int2)(x,y+I),         \

-                                 color->iN.rgba[ii] A); \

-      }

-

-#else

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                               \

-        SKC_RENDER_SURFACE_COLOR const rgba =                   \

-          (SKC_RENDER_SURFACE_COLOR)                            \

-          (color->aN.rgba[ii].r C,                              \

-           color->aN.rgba[ii].g C,                              \

-           color->aN.rgba[ii].b C,                              \

-           1.0);                                                \

-        SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);   \

-      }

-

-#endif

-

-      SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-

-      x += 1;

-    }

-#else

-    int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);

-    int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;

-

-    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))

-    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)

-      {

-#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                       \

-        SKC_RENDER_SURFACE_WRITE(surface,               \

-                                 (int2)(x+I,y+ii),      \

-                                 color->iN.rgba[ii] A); \

-      }

-

-#else

-

-#undef  SKC_EXPAND_X

-#define SKC_EXPAND_X(I,S,C,P,A) {                               \

-      SKC_RENDER_SURFACE_COLOR const rgba =                     \

-        (SKC_RENDER_SURFACE_COLOR)                              \

-        (color->aN.rgba[ii].r C,                                \

-        color->aN.rgba[ii].g C,                                 \

-        color->aN.rgba[ii].b C,                                 \

-        1.0);                                                   \

-      SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);  \

-    }

-

-#endif

-

-      SKC_RENDER_SCANLINE_VECTOR_EXPAND();

-    }

-

-#endif

-}

-

-#endif

-

-//

-//

-//

-static

-uint const

-skc_ttck_lane(uint const ttck_idx)

-{

-  return ttck_idx & SKC_RENDER_SUBGROUP_MASK;

-}

-

-//

-// RENDER KERNEL

-//

-

-__kernel

-SKC_RENDER_KERNEL_ATTRIBS

-void

-skc_kernel_render(__global   union  skc_layer_node   const * SKC_RESTRICT const layers,

-                  __global   struct skc_group_node   const * SKC_RESTRICT const groups,

-                  __global   union  skc_styling_cmd  const * SKC_RESTRICT const commands,     // FIXME -- rename

-

-                  __global   skc_ttck_t              const * SKC_RESTRICT const ttck_keys,    // rename: keys

-                  skc_uint                                                const ttck_count,   // rename: key_count

-

-                  __global   uint                    const * SKC_RESTRICT const ttck_offsets, // rename: offsets

-                  skc_uint                                                const tile_count,   // rename: offset_count

-

-                  __global   skc_ttxb_t              const * SKC_RESTRICT const ttxb_extent,

-#ifdef SKC_SURFACE_IS_BUFFER

-                  __global   void                          * SKC_RESTRICT const surface,

-#else

-                  __write_only image2d_t                                        surface,

-#endif

-#ifdef SKC_SURFACE_IS_BUFFER

-                  skc_uint                                                const surface_pitch,

-#endif

-                  uint4                                                   const tile_clip)    // rename: clip

-{

-  //

-  // Each subgroup is responsible for a tile.  No extra subgroups are

-  // launched.

-  //

-  // FIXME -- might be better implemented as a "grid stride loop" if

-  // Intel GEN really has a local memory "quantum" of 4KB which means

-  // we would need to launch 4 subgroups per workgroup.

-  //

-  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.

-  //

-

-  //

-  // declare tile cover and color registers

-  //

-  // this used to be a neat unified struct but the Intel GEN compiler

-  // wasn't cooperating and spilling to private memory even though all

-  // registers were indexed by constants

-  //

-  union skc_tile_color  color_wip;

-  union skc_tile_color  color_acc;

-

-  union skc_tile_cover  cover_wip;

-  union skc_tile_cover  cover_acc;

-  union skc_tile_cover  cover_msk;

-

-  //

-  // which subgroup in the grid is this?

-  //

-  // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0)

-  // as a uniform but the alternative calculation used when there are

-  // multiple subgroups per workgroup is not cooperating and

-  // driving spillage elsewhere.

-  //

-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )

-  skc_uint const ttck_offset_idx = get_group_id(0);

-#else

-  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();

-#endif

-

-  //

-  // load the starting ttck for this offset and get a bound on the max

-  // number of keys that might be loaded

-  //

-  // these are uniform across all subgroup lanes

-  //

-  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];

-

-  //

-  // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)

-  // vector of ttck keys

-  //

-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK

-

-  skc_ttck_t ttck = ttck_keys[ttck_idx];

-

-#else

-

-  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;

-  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;

-  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)]

-

-#endif

-

-  //

-  // set up style group/layer state

-  //

-  struct skc_styling_group {

-    union skc_group_range range;

-    skc_uint              depth;

-    skc_uint              id;

-  } group;

-

-  group.range.lo = 0;

-  group.range.hi = SKC_UINT_MAX;

-  group.depth    = 0;

-  group.id       = SKC_UINT_MAX;

-

-  //

-  // start with clear tile opacity, knockout and flag bits

-  //

-  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32

-  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32

-  //

-  skc_uint flags = 0;

-

-  //

-  // declare and initialize accumulators

-  //

-#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )

-  __local union skc_subgroup_smem                      smem[1];

-#else

-  __local union skc_subgroup_smem                      smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];

-  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();

-#endif

-

-#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK

-  //

-  // select the initial ttck key

-  //

-  skc_ttck_t ttck;

-#if 0

-  ttck    = sub_group_broadcast(ttck_s,ttck_lane);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN

-#else

-  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND

-  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);

-#endif

-

-#endif

-

-  //

-  // save the first key so we know what tile we're in

-  //

-  skc_ttck_t ttck0 = ttck;

-

-  //

-  // evaluate the coarse clip as late as possible

-  //

-  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);

-

-  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))

-    return;

-

-  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);

-

-  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))

-    return;

-

-#if 0

-  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);

-#endif

-

-  //

-  // load -> scatter -> flush

-  //

-  while (true)

-    {

-      // if scattering is disabled then just run through ttck keys

-      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;

-

-      // need to clear accumulators before a scatter loop

-      if (is_scatter_enabled)

-        {

-          skc_tile_aa_zero(smem);

-        }

-

-      do {

-        // skip scattering?

-        if (is_scatter_enabled)

-          {

-            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);

-

-            if (skc_ttck_lo_is_prefix(ttck.lo)) {

-              skc_scatter_ttpb(ttxb_extent,smem,xb_id);

-            } else {

-              skc_scatter_ttsb(ttxb_extent,smem,xb_id);

-            }

-          }

-

-        //

-        // any ttck keys left?

-        //

-        if (++ttck_idx >= ttck_count)

-          {

-            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;

-            break;

-          }

-

-        //

-        // process next ttck key

-        //

-#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK

-        //

-        // SIMD -- read next key

-        //

-        ttck = ttck_keys[ttck_idx];

-#else

-        //

-        // SIMT -- refresh the ttck_s?

-        //

-        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;

-

-        if (ttck_lane_next == 0)

-          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];

-

-        //

-        // broadcast next key to entire subgroup

-        //

-#if 0

-        ttck    = sub_group_broadcast(ttck_s,ttck_lane_next);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN

-#else

-        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND

-        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);

-#endif

-#endif

-        // continue scattering if on same YXL layer

-      } while (skc_ttck_equal_yxl(ttck0,ttck));

-

-      // finalize if no longer on same YX tile

-      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))

-        {

-          // otherwise, unwind the tile styling and exit

-          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;

-        }

-

-      //

-      // given: new layer id from ttxk key

-      //

-      // load [layer id]{ group id, depth }

-      //

-      // if within current group's layer range

-      //

-      //   if at same depth

-      //

-      //     load and execute cover>[mask>]color>blend commands

-      //

-      //   else if not at same depth then move deeper

-      //

-      //     for all groups in group trail from cur depth to new depth

-      //       enter group, saving and initializing regs as necessary

-      //     increment depth and update layer range

-      //     load and execute cover>[mask>]color>blend commands

-      //

-      // else not within layer range

-      //

-      //   exit current group, restoring regs as necessary

-      //   decrement depth and update layer range

-      //

-      //

-      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi

-      union skc_layer_node const layer_node_new = layers[layer_id_new];

-

-      // clear flag that controls group/layer traversal

-      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;

-

-      do {

-        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;

-

-        //

-        // is layer a child of the current parent group?

-        //

-        uint cmd_next = 0;

-

-        if (!unwind && (layer_node_new.parent == group.id))

-          {

-            // execute this layer's cmds

-            cmd_next = layer_node_new.cmds;

-

-            // if this is final then configure so groups get unwound, otherwise we're done

-            flags   |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);

-          }

-        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))

-          {

-            //

-            // is layer in a child group?

-            //

-            union skc_group_parents const gp = groups[layer_node_new.parent].parents;

-            uint                    const gn = gp.depth - ++group.depth;

-

-            if (gn == 0)

-              group.id = layer_node_new.parent;

-            else

-              group.id = commands[gp.base + gn - 1].parent;

-

-            // update group layer range

-            group.range = groups[group.id].range;

-

-            // enter current group

-            cmd_next    = groups[group.id].cmds.enter;

-          }

-        else // otherwise, exit this group

-          {

-            // enter current group

-            cmd_next = groups[group.id].cmds.leave;

-

-            // decrement group depth

-            if (--group.depth == 0)

-              {

-                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;

-              }

-            else

-              {

-                // get path_base of current group

-                uint const gnpb = groups[group.id].parents.base;

-

-                // get parent of current group

-                group.id    = commands[gnpb].parent;

-

-                // update group layer range

-                group.range = groups[group.id].range;

-              }

-          }

-

-        //

-        // execute cmds

-        //

-        while (true)

-          {

-            union skc_styling_cmd const cmd = commands[cmd_next++];

-

-            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)

-              {

-              case SKC_STYLING_OPCODE_NOOP:

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_NONZERO:

-                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_EVENODD:

-                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:

-                skc_tile_cover_accumulate(&cover_acc,&cover_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_MASK:

-                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:

-                skc_tile_cover_wip_zero(&cover_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:

-                skc_tile_cover_acc_zero(&cover_acc);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:

-                skc_tile_cover_msk_zero(&cover_msk);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_MASK_ONE:

-                skc_tile_cover_msk_one(&cover_msk);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:

-                skc_tile_cover_msk_invert(&cover_msk);

-                break;

-

-              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:

-                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:

-                //

-                // FIXME -- gradients shouldn't be executing so much

-                // conditional driven code at runtime since we *know*

-                // the gradient style on the host can just create a

-                // new styling command to exploit this.

-                //

-                // FIXME -- it might be time to try using the GPU's

-                // sampler on a linear array of half4 vectors -- it

-                // might outperform the explicit load/lerp routines.

-                //

-                // FIXME -- optimizing for vertical gradients (uhhh,

-                // they're actually horizontal due to the -90 degree

-                // view transform) is nice but is it worthwhile to

-                // have this in the kernel?  Easy to add it back...

-                //

-#if defined( SKC_ARCH_GEN9 )

-                // disable gradients due to excessive spillage -- fix later

-                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);

-#else

-                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);

-#endif

-                break;

-

-              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:

-                skc_tile_color_wip_zero(&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:

-                skc_tile_color_acc_zero(&color_acc);

-                break;

-

-              case SKC_STYLING_OPCODE_BLEND_OVER:

-                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_BLEND_PLUS:

-                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:

-                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:

-                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:

-                // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);

-                break;

-

-              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:

-                // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);

-                break;

-

-              case SKC_STYLING_OPCODE_BACKGROUND_OVER:

-                skc_tile_background_over(commands,&cmd_next,&color_acc);

-                break;

-

-              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:

-#ifdef SKC_SURFACE_IS_BUFFER

-                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);

-#else

-                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);

-#endif

-                break;

-

-              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:

-                if (skc_tile_color_test_opacity(&color_acc))

-                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;

-                break;

-

-              default:

-                return; // this is an illegal opcode -- trap and die!

-              }

-

-            //

-            // if sign bit is set then this was final command

-            //

-            if (cmd.s32 < 0)

-              break;

-          }

-

-        // continue as long as tile flush isn't complete

-      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);

-

-      // return if was the final flush

-      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)

-        return;

-

-      // update wip ttck_hi

-      ttck0 = ttck;

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+//
+//
+
+#include "tile.h"
+#include "block.h"
+#include "styling_types.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define SKC_RENDER_SUBGROUP_MASK  (SKC_RENDER_SUBGROUP_SIZE - 1)
+
+//
+//
+//
+
+#if   ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_1()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      0
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_2()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      1
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_4()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      3
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_8()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      7
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16 )
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND()           SKC_EXPAND_16()
+#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST      15
+#endif
+
+//
+// tile state flag bits
+//
+
+typedef enum skc_tile_flags_e {
+
+  // FLUSH
+  SKC_TILE_FLAGS_FLUSH_FINALIZE    = 0x00000001,
+  SKC_TILE_FLAGS_FLUSH_UNWIND      = 0x00000002,
+  SKC_TILE_FLAGS_FLUSH_COMPLETE    = 0x00000004,
+
+  // OPACITY
+  SKC_TILE_FLAGS_SCATTER_SKIP      = 0x00000008,
+
+  //
+  // Note: testing for opacity and skipping scattering is on its way
+  // to becoming a much more programmable option because sometimes we
+  // may be compositing/blending from back-to-front and/or be using
+  // group blend rules that ignore opacity.
+  //
+  // The point is that all of these decisions should be encoded in
+  // styling commands and, as much as possible, removed from the final
+  // group/layer styling traversal render loop.
+  //
+
+} skc_tile_flags_e;
+
+//
+// COVER -- assumes availability of either fp16 or fp32
+//
+
+union skc_tile_cover
+{
+  struct {
+    SKC_RENDER_TILE_COVER             c[SKC_TILE_WIDTH];
+  } aN;
+
+#ifdef SKC_RENDER_TILE_COVER_VECTOR
+  struct {
+    SKC_RENDER_TILE_COVER_VECTOR      c[SKC_RENDER_TILE_COVER_VECTOR_COUNT];
+  } vN;
+#endif
+};
+
+//
+// COLOR -- assumes availability of either fp16 or fp32
+//
+
+union skc_tile_color
+{
+  union {
+    struct {
+      SKC_RENDER_TILE_COLOR           r;
+      SKC_RENDER_TILE_COLOR           g;
+      SKC_RENDER_TILE_COLOR           b;
+      SKC_RENDER_TILE_COLOR           a;
+    } rgba[SKC_TILE_WIDTH];
+  } aN;
+
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+  union {
+    SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH];
+  } iN;
+#endif
+
+#ifdef SKC_RENDER_TILE_COLOR_VECTOR
+  union {
+    SKC_RENDER_TILE_COLOR_VECTOR      rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT];
+  } vN;
+#endif
+
+  struct {
+    union {
+      struct {
+        SKC_RENDER_TILE_COLOR         r;
+        SKC_RENDER_TILE_COLOR         g;
+      };
+      SKC_RENDER_GRADIENT_FLOAT       distance;
+    };
+    union {
+      struct {
+        SKC_RENDER_TILE_COLOR         b;
+        SKC_RENDER_TILE_COLOR         a;
+      };
+      SKC_RENDER_GRADIENT_FLOAT       stoplerp;
+    };
+  } grad[SKC_TILE_WIDTH];
+};
+
+//
+// SHARED MEMORY STATE
+//
+
+#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT)
+
+#define SKC_RENDER_WIDE_AA_BYTES   (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE)
+#define SKC_RENDER_WIDE_AA_WIDTH   (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA))
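A quick host-side evaluation of the three sizing macros above, using assumed values (a 32x32 tile, an 8-lane subgroup and a 64-bit SKC_RENDER_WIDE_AA type -- none of these are read from the real device config); it also shows how the wide zeroing loop in skc_tile_aa_zero further down lines up with the 64-byte-per-clock SLM write width noted there:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  // assumed values -- the real ones come from the per-device config headers
  unsigned const tile_width    = 32;
  unsigned const tile_height   = 32;
  unsigned const subgroup_size = 8;
  unsigned const wide_aa_size  = sizeof(uint64_t);   // assumed SKC_RENDER_WIDE_AA

  unsigned const smem_words    = (tile_width + 1) * tile_height;       // guard column included
  unsigned const aa_bytes      = smem_words * sizeof(int) / subgroup_size;
  unsigned const aa_width      = aa_bytes / wide_aa_size;

  // 1056 words, 528 bytes per lane, 66 wide stores per lane;
  // each iteration of the zeroing loop writes 8 lanes x 8 bytes = 64 bytes of SLM
  printf("words=%u bytes/lane=%u stores/lane=%u\n",smem_words,aa_bytes,aa_width);
  return 0;
}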
+
+//
+//
+//
+
+union skc_subgroup_smem
+{
+  //
+  // The tiles are stored in column-major / height-major order
+  //
+  // The final column is a guard column that is OK to write to but
+  // will never be read.  It simplifies the TTSB scatter but could be
+  // predicated if SMEM is really at a premium.
+  //
+#if ( SKC_RENDER_SUBGROUP_SIZE > 1 )
+  struct {
+    SKC_ATOMIC_UINT              area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
+  } atomic;
+#endif
+
+  struct {
+    int                          area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h]
+  } aN;
+
+  struct { // assumption is that height = subgroup
+    SKC_RENDER_AREA_V            area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE];
+  } vN;
+
+  struct { // assumption is that height = subgroup
+    SKC_RENDER_WIDE_AA           area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE];
+  } wide;
+
+  union skc_styling_cmd          cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT];
+
+  half                           gc  [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2];
+
+#if 0
+  //
+  // SPILL TO GMEM
+  //
+#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0)
+  struct {
+
+#if (SKC_REGS_COLOR_S > 0)
+    union skc_color_r            color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
+#endif
+
+#if (SKC_REGS_COVER_S > 0)
+    union float                  cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH];
+#endif
+
+  } regs;
+#endif
+  //
+  //
+  //
+#endif
+};
+
+//
+//
+//
+
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+
+#define skc_subgroup_lane()  0
+
+#else
+
+#define skc_subgroup_lane()  get_sub_group_local_id()
+
+#endif
+
+//
+//
+//
+
+typedef skc_uint  skc_ttsk_lo_t;
+typedef skc_uint  skc_ttsk_hi_t;
+
+typedef skc_uint  skc_ttpk_lo_t;
+typedef skc_uint  skc_ttpk_hi_t;
+
+typedef skc_uint  skc_ttxk_lo_t;
+typedef skc_uint  skc_ttxk_hi_t;
+
+typedef skc_uint  skc_ttck_lo_t;
+typedef skc_uint  skc_ttck_hi_t;
+
+typedef skc_uint2 skc_ttck_t;
+
+typedef skc_int   skc_ttxb_t;
+
+//
+// TTCK (32-BIT COMPARE) v1:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   18  |  7  |  7  |
+//
+//
+// TTCK (32-BIT COMPARE) v2:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          30          |    1   |    1   |   15  |  9  |  8  |
+//
+//
+// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
+//
+//  0                                                           63
+//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
+//  +----------------------+--------+--------+-------+-----+-----+
+//  |          27          |    1   |    1   |   18  |  9  |  8  |
+//
+
+static
+skc_uint
+skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a)
+{
+  return a & SKC_TTCK_LO_MASK_ID;
+}
+
+static
+skc_layer_id
+skc_ttck_get_layer(skc_ttck_t const a)
+{
+  //
+  // FIXME -- a union with a ulong and a shift down and mask is
+  // probably faster on some architectures
+  //
+  skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+  skc_uint const hi = (a.hi  & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER;
+
+  return lo | hi;
+}
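A minimal host-side sketch of the alternative mentioned in the FIXME above -- treating the key as a single 64-bit word and extracting LAYER with one shift and mask instead of recombining the lo/hi halves. The v1 field widths (30/1/1/18/7/7) from the layout comment are used purely for illustration; the macro names and test value below are hypothetical:

#include <stdint.h>
#include <stdio.h>

// TTCK v1 layout (from the comment above): ID:30 PREFIX:1 ESCAPE:1 LAYER:18 X:7 Y:7
#define TTCK_V1_OFFSET_LAYER  32u                            // ID + PREFIX + ESCAPE
#define TTCK_V1_BITS_LAYER    18u
#define TTCK_V1_MASK_LAYER    ((1ull << TTCK_V1_BITS_LAYER) - 1)

static uint32_t ttck_get_layer_u64(uint64_t const key)
{
  // one shift and one mask on the full 64-bit key
  return (uint32_t)((key >> TTCK_V1_OFFSET_LAYER) & TTCK_V1_MASK_LAYER);
}

int main(void)
{
  uint64_t const key = (uint64_t)0x2A5 << TTCK_V1_OFFSET_LAYER; // layer 677, all other fields zero

  printf("layer = %u\n",ttck_get_layer_u64(key));              // prints 677
  return 0;
}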
+
+static
+skc_uint
+skc_ttck_hi_get_x(skc_ttck_hi_t const a)
+{
+  return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X);
+}
+
+static
+skc_uint
+skc_ttck_hi_get_y(skc_ttck_hi_t const a)
+{
+  return a >> SKC_TTCK_HI_OFFSET_Y;
+}
+
+static
+skc_bool
+skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b)
+{
+  skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE);
+  skc_uint const hi = (a.hi ^ b.hi);
+
+  return (lo | hi) == 0;
+}
+
+static
+skc_bool
+skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b)
+{
+  return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0;
+}
+
+static
+skc_bool
+skc_ttck_lo_is_prefix(skc_ttck_lo_t const a)
+{
+  return (a & SKC_TTCK_LO_MASK_PREFIX) != 0;
+}
+
+//
+// TILE TRACE SUBPIXEL
+//
+// The subpixels are encoded with either absolute tile coordinates
+// (32-bits) or packed in delta-encoded form.
+//
+// For 32-bit subpixel packing of a 32x32 tile:
+//
+// A tile X is encoded as:
+//
+//   TX : 10 : unsigned min(x0,x1) tile subpixel coordinate.
+//
+//   SX :  6 : unsigned subpixel span from min to max x with range
+//             [0,32]. The original direction is not captured. It
+//             would be nice to capture dx, but it isn't necessary
+//             right now and could be added later. <--- SPARE VALUES AVAILABLE
+//
+// A tile Y is encoded as:
+//
+//   TY : 10 : unsigned min(y0,y1) tile subpixel coordinate.
+//
+//   DY :  6 : signed subpixel delta y1-y0. The range of delta is
+//             [-32,32] but horizontal lines are not encoded so [1,32]
+//             is mapped to [0,31]. The resulting range [-32,31] fits
+//             in 6 bits.
+//
+// TTS:
+//
+//  0                        31
+//  |  TX |  SX  |  TY |  DY  |
+//  +-----+------+-----+------+
+//  |  10 |   6  |  10 |   6  |
+//
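A hedged host-side illustration of the 10/6/10/6 packing described above, including the DY remap (stored [0,31] is read back as [1,32]). The field offsets (TX at bit 0, SX at 10, TY at 16, DY at 26) follow the field order in the diagram and are assumptions, as is the pack helper; the decode mirrors skc_tts_get_dy_v below and relies on the usual arithmetic right shift, as the OpenCL version does:

#include <stdint.h>
#include <stdio.h>

// assumed offsets, following the | TX | SX | TY | DY | order above
#define TTS_OFFSET_SX  10
#define TTS_OFFSET_TY  16
#define TTS_OFFSET_DY  26

static int32_t tts_pack(uint32_t tx, uint32_t sx, uint32_t ty, int32_t dy)
{
  // dy is in [-32,32] with 0 excluded; [1,32] is stored as [0,31]
  int32_t const dy_enc = (dy > 0) ? dy - 1 : dy;

  return (int32_t)( tx
                  | (sx << TTS_OFFSET_SX)
                  | (ty << TTS_OFFSET_TY)
                  | ((uint32_t)(dy_enc & 0x3F) << TTS_OFFSET_DY));
}

static int32_t tts_get_dy(int32_t const a)
{
  int32_t const dy = a >> TTS_OFFSET_DY;      // arithmetic shift sign-extends the DY field

  return dy - (~a >> 31);                     // if stored dy >= 0 then ++dy
}

int main(void)
{
  int32_t const tts = tts_pack(100,7,640,5);

  printf("dy = %d\n",tts_get_dy(tts));        // prints 5
  return 0;
}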
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a)
+{
+  //
+  // extract the whole pixel y coordinate
+  //
+  return SKC_BFE(a,
+                 SKC_TTS_BITS_TY   - SKC_SUBPIXEL_RESL_Y_LOG2,
+                 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2);
+}
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a)
+{
+  //
+  // get the linear array tile index of the pixel
+  //
+  return (((a & SKC_TTS_MASK_TX_PIXEL)
+
+#if   (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2)
+           >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2)
+#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2)
+           << (SKC_TILE_HEIGHT_LOG2     - SKC_SUBPIXEL_RESL_X_LOG2)
+#endif
+
+           ) | skc_tts_get_ty_pixel_v(a));
+}
+
+#if 0
+static
+skc_ttx_v_s32_t
+skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
+{
+  skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY;
+
+  return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31));
+}
+#else
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_dy_v(SKC_RENDER_TTS_V const a)
+{
+  SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY;
+
+  return dy - (~a >> 31);
+}
+#endif
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a)
+{
+  return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2);
+}
+
+static
+SKC_RENDER_TTS_V_BITFIELD
+skc_tts_get_sx_v(SKC_RENDER_TTS_V const a)
+{
+  return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX);
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem)
+{
+  //
+  // SIMD / CPU
+  //
+  //      &
+  //
+  // SIMT / GPU
+  //
+  // Note that atomic_init() is likely implemented as a simple
+  // assignment so there is no identifiable performance difference on
+  // current targets.
+  //
+  // If such an architecture appears in the future then we'll probably
+  // still want to implement this zero'ing operation as below but
+  // follow with an appropriate fence that occurs before any scatter
+  // operations.
+  //
+  // The baroque expansion below improves performance on Intel GEN by,
+  // presumably, achieving the 64-byte per clock SLM write as well as
+  // minimizing the overall number of SEND() block initializations and
+  // launches.
+  //
+  // Intel GENx has a documented 64 byte per cycle SLM write limit.
+  // So having each lane in an 8 lane subgroup zero-write 8 bytes is
+  // probably a safe bet (Later: benchmarking backs this up!).
+  //
+  // Note there is no reason at this time to unroll this loop.
+  //
+  for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++)
+    smem->wide.area[ii][skc_subgroup_lane()] = ( 0 );
+}
+
+//
+// Note this is going to be vectorizable on most architectures.
+//
+// The return of the key translation feature might complicate things.
+//
+
+static
+void
+skc_scatter_ttpb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
+                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
+                 skc_block_id_t                                  const pb_id)
+{
+  skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();
+
+#if   ( SKC_TILE_RATIO == 1 )
+
+  SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset];
+
+#elif ( SKC_TILE_RATIO == 2 )
+
+  SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent);
+
+#else
+
+#error("tile ratio greater than 2 not supported")
+
+#endif
+
+  //
+  // Note there is no need to use an atomic for this operation on the
+  // current group of target platforms... but this may change if
+  // atomic ops truly go through a different path.
+  //
+  // As noted above, this direct increment is probably faster and can
+  // always be followed by a fence.
+  //
+  // Furthermore, note that the key sorting orders all ttck keys
+  // before ttpk keys.
+  //
+
+  //
+  // FIXME -- if the SMEM store is wider than bank word count then we
+  // might want to odd-even interleave the TTP values if the target
+  // device can't handle 64-bit stores
+  //
+
+  //
+  // skipping per-key translation for now
+  //
+  smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1);
+}
+
+//
+// Note that skc_scatter_ttsb is *not* vectorizable unless the
+// architecture supports a "scatter-add" capability.  All relevant
+// GPUs support atomic add on shared/local memory and thus support
+// scatter-add.
+//
+
+static
+void
+skc_scatter_ttsb(__global skc_ttxb_t        const * SKC_RESTRICT const ttxb_extent,
+                 __local  union skc_subgroup_smem * SKC_RESTRICT const smem,
+                 skc_block_id_t                                  const sb_id)
+{
+  skc_uint         const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
+
+  SKC_RENDER_TTS_V const tts_v  = ttxb_extent[offset];
+
+  //
+  // Skipping per-key translation for now
+  //
+
+  // Index into tile
+  //
+  // The tiles are stored in column-major / height-major order
+  //
+  // The final column is a guard column that is OK to write to but
+  // will never be read.  It simplifies the TTSB scatter but could be
+  // predicated if SMEM is really at a premium.
+  //
+
+  SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v);
+
+#if 0
+  if (tts_v != SKC_TTS_INVALID)
+    printf("(%08X) = %u\n",tts_v,xy_idx);
+#endif
+
+  //
+  // adjust subpixel range to max y
+  //
+  // range is stored as [-32,31] and when read [0,31] is mapped to
+  // [1,32] because a dy of 0 is not possible.
+  //
+  // more succinctly: if dy >= 0 then ++dy
+  //
+  SKC_RENDER_TTS_V_BITFIELD const dy     = skc_tts_get_dy_v(tts_v);
+
+  //
+  // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid?
+  //
+
+  // this "min(x0) * 2 + dx" is equivalent to "x0 + x1"
+  SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v);
+
+  // Calculate left and right coverage contribution trapezoids
+  SKC_RENDER_TTS_V_BITFIELD const left   = dy * widths;
+  SKC_RENDER_TTS_V_BITFIELD const right  = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left;
+
+  //
+  // Accumulate altitudes and areas
+  //
+  // Optimization: if the device supports an CPU/SIMD vector-add or
+  // GPU/SIMT scatter-add atomic int2 add operation then placing the
+  // ALT and AREA values side-by-side would halve the number of
+  // additions.
+  //
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+  //
+  // CPU/SIMD
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                 \
+  if (tts_v C != SKC_TTS_INVALID) {                             \
+    smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left  C;       \
+    smem->aN.area[                  xy_idx C] += right C;       \
+  }
+
+#else
+  //
+  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
+  //
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A)                                         \
+  if (tts_v C != SKC_TTS_INVALID) {                                     \
+    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area +           \
+                                          SKC_TILE_HEIGHT   + xy_idx C, \
+                                          left C);                      \
+    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \
+                                          right C);                     \
+  }
+#endif
+
+  SKC_RENDER_TTSB_EXPAND();
+}
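To make the left/right trapezoid split in skc_scatter_ttsb above concrete, here is a plain-C worked example assuming a 32-subpixel X resolution (i.e. SKC_SUBPIXEL_RESL_X_LOG2 == 5); the invariant is that the two contributions always sum to dy times the doubled subpixel width:

#include <stdio.h>
#include <assert.h>

int main(void)
{
  int const resl_x_log2 = 5;                 // assumed: 32 subpixels per pixel in x
  int const x0 = 4, x1 = 10, dy = 3;         // an arbitrary subpixel segment

  int const widths = x0 * 2 + (x1 - x0);     // "min(x0) * 2 + dx" == x0 + x1
  int const left   = dy * widths;
  int const right  = (dy << (resl_x_log2 + 1)) - left;

  assert(widths == x0 + x1);
  assert(left + right == dy * 2 * 32);       // both halves always sum to dy * 2 * RESL_X

  printf("left=%d right=%d\n",left,right);   // left=42 right=150
  return 0;
}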
+
+//
+// Note that 2048.0 can be represented exactly with fp16... fortuitous!
+//
+
+#define SKC_RENDER_FILL_MAX_AREA          (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y)
+#define SKC_RENDER_FILL_MAX_AREA_2        (2u * SKC_RENDER_FILL_MAX_AREA)
+#define SKC_RENDER_FILL_EVEN_ODD_MASK     (SKC_RENDER_FILL_MAX_AREA_2 - 1)
+#define SKC_RENDER_FILL_MAX_AREA_RCP_F32  (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA)
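Assuming a 32x32 subpixel grid, SKC_RENDER_FILL_MAX_AREA works out to 2 * 32 * 32 = 2048 = 2^11. An fp16 has a 10-bit mantissa plus an implicit leading bit, so every integer in [0, 2048] is exactly representable, and 1.0f / 2048 is an exact power of two, so the reciprocal macro loses no precision either -- which is what the "fortuitous" note above relies on. (Both resolutions are assumed here; the real values come from the SKC_SUBPIXEL_RESL_* config.)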
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
+                       union skc_tile_cover            * SKC_RESTRICT const cover,
+                       union skc_tile_color            * SKC_RESTRICT const color)
+{
+  SKC_RENDER_ACC_COVER_INT area = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
+      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
+      SKC_RENDER_TILE_COVER     const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA));
+
+      cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32);
+    }
+}
+
+static
+void
+skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem,
+                       union skc_tile_cover            * SKC_RESTRICT const cover,
+                       union skc_tile_color            * SKC_RESTRICT const color)
+{
+  SKC_RENDER_ACC_COVER_INT area = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      area                                   += smem->vN.area[ii][skc_subgroup_lane()];
+      SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area);
+      SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA));
+
+      cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32;
+    }
+}
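The two fill rules above differ only in how the accumulated signed area is folded back into [0, MAX_AREA]. A small host-side sketch, assuming MAX_AREA = 2048 (32x32 subpixels), showing nonzero saturating while even-odd reflects as a triangle wave:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  int const max_area = 2048;                         // assumed: 2 * 32 * 32
  int const eo_mask  = 2 * max_area - 1;             // 4095

  int const areas[]  = { 500, 2048, 3000, 4096 };

  for (int ii=0; ii<4; ii++)
    {
      int const trapabs = abs(areas[ii]);

      // nonzero: saturate at full coverage
      float const nonzero = (float)(trapabs < max_area ? trapabs : max_area) / max_area;

      // even-odd: 0..2048 maps up to full coverage, 2048..4096 maps back down to zero
      int   const reflect = abs((trapabs & eo_mask) - max_area);
      float const evenodd = (float)(max_area - reflect) / max_area;

      printf("area=%4d nonzero=%.3f evenodd=%.3f\n",areas[ii],nonzero,evenodd);
    }
  return 0;
}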
+
+//
+//
+//
+
+static
+void
+skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                          uint                                 * SKC_RESTRICT const cmd_next,
+                          union skc_tile_color                 * SKC_RESTRICT const color)
+{
+  //
+  // rgba = solid fill
+  //
+  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
+
+  *cmd_next += 2;
+
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR )
+
+  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].r = rg.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].g = rg.hi;
+
+  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].b = ba.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].a = ba.hi;
+
+#else
+
+  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
+  SKC_RENDER_TILE_COLOR      const r  = rg.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r);
+
+  SKC_RENDER_TILE_COLOR      const g  = rg.hi;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g);
+
+  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
+  SKC_RENDER_TILE_COLOR      const b  = ba.lo;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b);
+
+  SKC_RENDER_TILE_COLOR      const a  = ba.hi;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a);
+
+#endif
+}
+
+//
+// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
+//
+// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
+//
+// Lerp in two fma/mad ops:
+//
+//    t * b + ((-t) * a + a)
+//
+// Note: OpenCL documents mix() as being implemented as:
+//
+//    a + (b - a) * t
+//
+// But this may be a native instruction on some devices.  For example,
+// on GEN9 there is an LRP "linear interpolation" function but it
+// doesn't appear to support half floats.
+//
+
+#if 1
+#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
+#else
+#define SKC_LERP(a,b,t)  mix(a,b,t)
+#endif
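Expanding the fused form shows it is algebraically identical to mix(): mad(t,b,mad(-t,a,a)) = t*b + (a - t*a) = a + t*(b - a). The two-fma version simply keeps each product in its own fused op, which is the point of the Juffa formulation cited above.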
+
+//
+// CPUs have a mock local address space so copying the gradient header
+// is probably not useful.  Just read directly from global.
+//
+
+#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
+#define SKC_RENDER_GRADIENT_SPACE  __local
+#else
+#define SKC_RENDER_GRADIENT_SPACE  __global
+#endif
+
+//
+// gradient is non-vertical
+//
+// removed the vertical (actually, horizontal) special case
+//
+
+static
+void
+skc_tile_color_fill_gradient_linear_nonvertical(__local  union skc_subgroup_smem     * SKC_RESTRICT const smem,
+                                                __global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                                                uint                                 * SKC_RESTRICT const cmd_next,
+                                                union skc_tile_color                 * SKC_RESTRICT const color,
+                                                skc_ttck_hi_t                                       const ttck_hi)
+{
+  //
+  // Where is this tile?
+  //
+  // Note that the gradient is being sampled from pixel centers.
+  //
+  SKC_RENDER_GRADIENT_FLOAT const y =
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P
+    (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) +
+    (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE));
+
+  float                     const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH);
+
+  //
+  // Get starting numerator and denominator
+  //
+  // Note: if gh[0].dx is exactly 0.0f then this is a vertical
+  // gradient and can be handled by a special opcode.
+  //
+  // Note: the mad() ordering is slightly different than the original
+  // CUDA implementation.
+  //
+  union skc_gradient_vector const gv       = { vload4(0,&commands[*cmd_next].f32) };
+
+  *cmd_next += 4;
+
+  float                     const gv_x_dot = mad(x,gv.dx,gv.p0);
+  SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot);
+
+  //
+  // Where are columns along gradient vector?
+  //
+  // TODO: Note that the gv_denom isn't multiplied through.
+  //
+  // Please doublecheck this... but I recall that in certain cases
+  // this wipes out some precision and results in minor but noticeable
+  // gradient artifacts.
+  //
+  // All arguments are scalars except gv_numer so a simpler
+  // evaluation might save some flops.
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom;
+
+  //
+  // is gradient non-repeating, repeating or reflecting?
+  //
+  switch (commands[(*cmd_next)++].u32)
+    {
+    case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING:
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f);
+      break;
+
+    case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING:
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].distance -= floor(color->grad[ii].distance);
+      break;
+
+    default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING
+      //
+      // OPTIMIZATION: Can this be done in fewer than ~4 ops?
+      //
+      // Note: OpenCL "rint()" is round-to-nearest-even integer!
+      //
+      // Note: the floor() "round to -inf" op is implemented in the
+      // GEN op 'FRC' so probably don't use trunc() when floor will
+      // suffice.
+      //
+
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        {
+          SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance);
+          color->grad[ii].distance = fabs(dist_abs - rint(dist_abs));
+        }
+    }
+
+  //
+  // initialize "stoplerp" for all columns
+  //
+  uint const slope_count = commands[(*cmd_next)++].u32;
+  uint const gd_n_v1     = commands[(*cmd_next)++].u32; // REMOVE ME
+
+  {
+    float const slope = commands[(*cmd_next)++].f32;
+
+    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+      color->grad[ii].stoplerp = color->grad[ii].distance * slope;
+  }
+
+  //
+  // compute stoplerp for remaining stops
+  //
+  for (int jj=1; jj<slope_count; jj++)
+    {
+      float const floor = (float)jj;
+      float const slope = commands[(*cmd_next)++].f32;
+
+      // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+      for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+        color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp);
+    }
+
+  //
+  // copy gradient colors to local memory
+  //
+  uint const gd_n = slope_count + 1;
+
+#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL
+  //
+  // copy entire gradient descriptor to local memory
+  //
+  for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE)
+    smem->cmds[ii].u32 = commands[*cmd_next + ii].u32;
+
+  __local  half const * const SKC_RESTRICT gc = smem->gc + 0;
+#else
+  //
+  // prefetch entire gradient header
+  //
+  // no noticeable impact on performance
+  //
+  // prefetch(&commands[*cmd_next].u32,gh_words);
+  //
+  __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0;
+#endif
+
+  //
+  // adjust cmd_next so that V1 structure is consumed -- FIXME
+  //
+  *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n);
+
+  //
+  // lerp between color pair stops
+  //
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      //
+      // Finally, we have the gradient stop index and the color stop
+      // pair lerp fraction
+      //
+      // Note that if these are vector values then a gather operation
+      // must occur -- there may be platforms (AVX-512?) that can
+      // perform an explicit gather on a vector type but it's not
+      // really expressible in OpenCL except implicitly with a
+      // workgroup of work items.
+      //
+      // ***********************
+      //
+      // FIXME -- USE HERB'S SINGLE FMA LERP
+      //
+      // ***********************
+      //
+      SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp);
+      SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp));
+
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac);
+      }
+
+      //
+      //
+      //
+      {
+        SKC_RENDER_TILE_COLOR lo, hi;
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                                       \
+          SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \
+          lo C                                = cc.lo;                  \
+          hi C                                = cc.hi;                  \
+        }
+
+        SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+        color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac);
+      }
+    }
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_blend_over(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                    union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // fralunco = cover.wip * acc.a
+  //
+  // acc.r    =  fralunco * wip.r + acc.r
+  // acc.g    =  fralunco * wip.g + acc.g
+  // acc.b    =  fralunco * wip.b + acc.b
+  // acc.a    = -fralunco * wip.a + acc.a
+  //
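+  // a scalar sketch of the same step, assuming acc.a carries the
+  // remaining transparency (1 - alpha):
+  //
+  //   f      = cover * acc.a;   // weight contributed by this layer
+  //   acc.r += f * wip.r;       // premultiplied color accumulates
+  //   acc.a -= f * wip.a;       // that much transparency is consumed
+  //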
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a;
+
+      color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
+      color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
+      color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
+      color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
+    }
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_blend_plus(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                    union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                    union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // cover_min = min(cover.wip,a.acc)
+  //
+  // r.acc =  cover_min * r.wip + r.acc
+  // g.acc =  cover_min * g.wip + g.acc
+  // b.acc =  cover_min * b.wip + b.acc
+  // a.acc = -cover_min * a.wip + a.acc
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a);
+
+      color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
+      color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
+      color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
+      color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
+    }
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_blend_multiply(union skc_tile_color       * SKC_RESTRICT const color_acc,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                        union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // r.acc = (cover.wip * r.wip) * r.acc
+  // g.acc = (cover.wip * g.wip) * g.acc
+  // b.acc = (cover.wip * b.wip) * b.acc
+  // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha)
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r;
+      color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g;
+      color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b;
+      color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a;
+    }
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_blend_knockout(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
+                        union skc_tile_color       * SKC_RESTRICT const color_acc,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_wip,
+                        union skc_tile_color const * SKC_RESTRICT const color_wip)
+{
+  //
+  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
+  // cover.acc         = cover.acc + cover.wip.contrib
+  //
+  // r.acc =  cover.wip.contrib * r.wip + r.acc
+  // g.acc =  cover.wip.contrib * g.wip + g.acc
+  // b.acc =  cover.wip.contrib * b.wip + b.acc
+  // a.acc = -cover.wip.contrib * a.wip * a.acc
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii];
+
+      cover_acc->aN.c[ii]     += contrib;
+
+      color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r);
+      color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g);
+      color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b);
+      color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a);
+    }
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_msk_copy_wip(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
+                            union skc_tile_cover const * SKC_RESTRICT const cover_wip)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover_msk->aN.c[ii] = cover_wip->aN.c[ii];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover_msk->vN.c[ii] = cover_wip->vN.c[ii];
+
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_msk_copy_acc(union skc_tile_cover       * SKC_RESTRICT const cover_msk,
+                            union skc_tile_cover const * SKC_RESTRICT const cover_acc)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover_msk->aN.c[ii] = cover_acc->aN.c[ii];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover_msk->vN.c[ii] = cover_acc->vN.c[ii];
+
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_accumulate(union skc_tile_cover       * SKC_RESTRICT const cover_acc,
+                          union skc_tile_cover const * SKC_RESTRICT const cover_wip)
+{
+  //
+  // cover.wip.contrib = (1.0 - cover.acc) * cover.wip
+  // cover.acc         = cover.acc + cover.wip.contrib
+  //
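+  // e.g. cover.acc = 0.6 and cover.wip = 0.5 accumulate to
+  // 0.6 + 0.4 * 0.5 = 0.8, so coverage saturates toward 1.0 and
+  // never overshoots
+  //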
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]);
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_wip_mask(union skc_tile_cover       * SKC_RESTRICT const cover_wip,
+                        union skc_tile_cover const * SKC_RESTRICT const cover_msk)
+{
+  //
+  // cover.wip *= cover.msk
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover_wip->aN.c[ii] *= cover_msk->aN.c[ii];
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  //
+  // GEN9 compiler underperforms on this
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+
+static
+void
+skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  //
+  // GEN9 compiler underperforms on this
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+
+static
+void
+skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 0;
+
+#else
+  //
+  // GEN9 compiler underperforms on this
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 0;
+
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 1;
+
+#else
+  //
+  // GEN9 compiler underperforms on this
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE;
+
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover)
+{
+#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    cover->aN.c[ii] = 1 - cover->aN.c[ii];
+
+#else
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++)
+    cover->vN.c[ii] = 1 - cover->vN.c[ii];
+
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color)
+{
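+  //
+  // note: "zero" sets r/g/b to 0 but a to 1 -- the alpha channel
+  // appears to carry remaining transparency (1 - alpha), so a
+  // freshly cleared tile is fully transparent
+  //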
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      color->aN.rgba[ii].r = 0;
+      color->aN.rgba[ii].g = 0;
+      color->aN.rgba[ii].b = 0;
+      color->aN.rgba[ii].a = 1;
+    }
+
+#else
+  //
+  // DISABLED ON GEN9 -- probably a compiler bug
+  //
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = 1;
+#endif
+}
+
+static
+void
+skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color)
+{
+#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 )
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      color->aN.rgba[ii].r = 0;
+      color->aN.rgba[ii].g = 0;
+      color->aN.rgba[ii].b = 0;
+      color->aN.rgba[ii].a = 1;
+    }
+
+#else
+  //
+  // DISABLED ON GEN9 -- probably a compiler bug
+  //
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.even = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.even  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].even.odd  = 0;
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    color->vN.rgba[ii].odd.odd   = 1;
+#endif
+}
+
+//
+//
+//
+
+static
+bool
+skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color)
+{
+  //
+  // returns true if tile is opaque
+  //
+  // various hacks to test for complete tile opacity
+  //
+  // note that front-to-back rendering currently represents full
+  // opacity with an alpha of 0.0f -- this can be harmonized to use a
+  // traditional alpha if we want to support rendering in either
+  // direction
+  //
+  // hack -- ADD/MAX/OR all the alphas together and test for non-zero
+  //
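+  // e.g. if every lane's remaining transparency is exactly 0.0f then
+  // the sum t below is 0, the any()/sub_group_any() test fails, and
+  // the tile is reported opaque -- letting the caller set
+  // SKC_TILE_FLAGS_SCATTER_SKIP and stop scattering further keys
+  //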
+  SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a;
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1)))
+  for (uint ii=1; ii<SKC_TILE_WIDTH; ii++)
+    t += color->aN.rgba[ii].a;
+
+#if ( SKC_RENDER_SUBGROUP_SIZE == 1 )
+  //
+  // SIMD
+  //
+  return !any(t != ( 0 ));
+
+#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 )
+  //
+  // SIMT - scalar per lane
+  //
+  return !sub_group_any(t != 0);
+
+#else
+  //
+  // SIMT - vector per lane
+  //
+  return !sub_group_any(any(t != ( 0 )));
+
+#endif
+
+  //
+  // TODO: The alternative vector-per-lane implementation below is
+  // *not* believed to be performant because the terse vector-wide
+  // test is just hiding a series of comparisons and is likely worse
+  // than the blind ADD/MAX/OR'ing of all alphas followed by a single
+  // test.
+  //
+#if 0
+  //
+  // SIMT - vector per lane
+  //
+
+  // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1)))
+  for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++)
+    {
+      if (sub_group_any(any(color->vN.ba[ii].a != ( 0 ))))
+        return false;
+    }
+
+  return true;
+#endif
+}
+
+//
+//
+//
+
+static
+void
+skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands,
+                         uint                                 * SKC_RESTRICT const cmd_next,
+                         union skc_tile_color                 * SKC_RESTRICT const color)
+{
+  //
+  // acc.r = acc.a * r + acc.r
+  // acc.g = acc.a * g + acc.g
+  // acc.b = acc.a * b + acc.b
+  //
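+  // e.g. with 0.25 of the tile's transparency remaining (acc.a = 0.25)
+  // and a background red of 1.0, acc.r gains 0.25 -- note that acc.a
+  // itself is left untouched by this command
+  //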
+  __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0;
+
+  *cmd_next += 2;
+
+  SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g);
+
+  SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b);
+}
+
+//
+//
+//
+
+// #define SKC_SURFACE_IS_BUFFER
+#ifdef  SKC_SURFACE_IS_BUFFER
+
+static
+void
+skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface,
+                              skc_uint                                           const surface_pitch,
+                              union skc_tile_color          const * SKC_RESTRICT const color,
+                              skc_ttck_hi_t                                      const ttck_hi)
+{
+  //
+  // NEW MAJOR OPTIMIZATION:
+  //
+  // Rotating and rasterizing the original world transform by -90
+  // degrees and then rendering the scene by +90 degrees enables
+  // all of the final surface composite to be performed in perfectly
+  // coalesced wide transactions.
+  //
+  // For this reason, linear access to the framebuffer is preferred.
+  //
+  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
+  //
+  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
+  //
+  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
+  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
+  //
+  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
+  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
+  //
+  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
+  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
+  //
+  uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE;
+  uint const x     = skc_ttck_hi_get_x(ttck_hi);
+  uint const y     = skc_ttck_hi_get_y(ttck_hi) ;
+  uint const base  = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane();
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+      SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 );
+
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255);
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8;
+      rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16;
+
+      surface[base + ii * pitch] = rgba;
+
+      // printf("%08v2X\n",rgba);
+    }
+}
+
+#else
+
+static
+void
+skc_surface_composite_u8_rgba(__write_only image2d_t                          surface,
+                              union skc_tile_color const * SKC_RESTRICT const color,
+                              skc_ttck_hi_t                                   const ttck_hi)
+{
+  //
+  // NEW MAJOR OPTIMIZATION:
+  //
+  // Rotating and rasterizing the original world transform by -90
+  // degrees and then rendering the scene by +90 degrees enables
+  // all of the final surface composite to be performed in perfectly
+  // coalesced wide transactions.
+  //
+  // For this reason, linear access to the framebuffer is preferred.
+  //
+  // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv
+  //
+  // NOTE THIS IS TRANSPOSED BY 90 DEGREES
+  //
+  // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE
+  // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING.
+  //
+  // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS
+  // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS
+  //
+  // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL
+  // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER
+  //
+
+#if 1
+  int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+  int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+
+  // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+  for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+    {
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+        SKC_RENDER_SURFACE_WRITE(surface,               \
+                                 (int2)(x,y+I),         \
+                                 color->iN.rgba[ii] A); \
+      }
+
+#else
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+        SKC_RENDER_SURFACE_COLOR const rgba =                   \
+          (SKC_RENDER_SURFACE_COLOR)                            \
+          (color->aN.rgba[ii].r C,                              \
+           color->aN.rgba[ii].g C,                              \
+           color->aN.rgba[ii].b C,                              \
+           1.0);                                                \
+        SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba);   \
+      }
+
+#endif
+
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+
+      x += 1;
+    }
+#else
+    int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE);
+    int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH;
+
+    // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH)))
+    for (uint ii=0; ii<SKC_TILE_WIDTH; ii++)
+      {
+#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                       \
+        SKC_RENDER_SURFACE_WRITE(surface,               \
+                                 (int2)(x+I,y+ii),      \
+                                 color->iN.rgba[ii] A); \
+      }
+
+#else
+
+#undef  SKC_EXPAND_X
+#define SKC_EXPAND_X(I,S,C,P,A) {                               \
+      SKC_RENDER_SURFACE_COLOR const rgba =                     \
+        (SKC_RENDER_SURFACE_COLOR)                              \
+        (color->aN.rgba[ii].r C,                                \
+        color->aN.rgba[ii].g C,                                 \
+        color->aN.rgba[ii].b C,                                 \
+        1.0);                                                   \
+      SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba);  \
+    }
+
+#endif
+
+      SKC_RENDER_SCANLINE_VECTOR_EXPAND();
+    }
+
+#endif
+}
+
+#endif
+
+//
+//
+//
+static
+uint const
+skc_ttck_lane(uint const ttck_idx)
+{
+  return ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+}
+
+//
+// RENDER KERNEL
+//
+
+__kernel
+SKC_RENDER_KERNEL_ATTRIBS
+void
+skc_kernel_render(__global   union  skc_layer_node   const * SKC_RESTRICT const layers,
+                  __global   struct skc_group_node   const * SKC_RESTRICT const groups,
+                  __global   union  skc_styling_cmd  const * SKC_RESTRICT const commands,     // FIXME -- rename
+
+                  __global   skc_ttck_t              const * SKC_RESTRICT const ttck_keys,    // rename: keys
+                  skc_uint                                                const ttck_count,   // rename: key_count
+
+                  __global   uint                    const * SKC_RESTRICT const ttck_offsets, // rename: offsets
+                  skc_uint                                                const tile_count,   // rename: offset_count
+
+                  __global   skc_ttxb_t              const * SKC_RESTRICT const ttxb_extent,
+#ifdef SKC_SURFACE_IS_BUFFER
+                  __global   void                          * SKC_RESTRICT const surface,
+#else
+                  __write_only image2d_t                                        surface,
+#endif
+#ifdef SKC_SURFACE_IS_BUFFER
+                  skc_uint                                                const surface_pitch,
+#endif
+                  uint4                                                   const tile_clip)    // rename: clip
+{
+  //
+  // Each subgroup is responsible for a tile.  No extra subgroups are
+  // launched.
+  //
+  // FIXME -- might be better implemented as a "grid stride loop" if
+  // Intel GEN really has a local memory "quantum" of 4KB which means
+  // we would need to launch 4 subgroups per workgroup.
+  //
+  // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB.
+  //
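+  //
+  // (a grid-stride sketch, not what this kernel currently does:
+  //
+  //    for (uint idx = ttck_offset_idx; idx < tile_count; idx += total_subgroups)
+  //      { ... process tile idx ... }
+  //
+  //  where total_subgroups is a hypothetical count of all launched subgroups)
+  //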
+
+  //
+  // declare tile cover and color registers
+  //
+  // this used to be a neat unified struct but the Intel GEN compiler
+  // wasn't cooperating and was spilling to private memory even though all
+  // registers were indexed by constants
+  //
+  union skc_tile_color  color_wip;
+  union skc_tile_color  color_acc;
+
+  union skc_tile_cover  cover_wip;
+  union skc_tile_cover  cover_acc;
+  union skc_tile_cover  cover_msk;
+
+  //
+  // which subgroup in the grid is this?
+  //
+  // TAKE NOTE: the Intel GEN compiler recognizes get_group_id(0) as
+  // a uniform, but the alternative calculation used when there are
+  // multiple subgroups per workgroup is not recognized as uniform and
+  // drives register spillage elsewhere.
+  //
+#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  skc_uint const ttck_offset_idx = get_group_id(0);
+#else
+  skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id();
+#endif
+
+  //
+  // load the starting ttck for this offset and get a bound on the max
+  // number of keys that might be loaded
+  //
+  // these are uniform across all subgroup lanes
+  //
+  skc_uint ttck_idx = ttck_offsets[ttck_offset_idx];
+
+  //
+  // FIXME -- SIMD/CPU version should probably load a 256-bit (4-wide)
+  // vector of ttck keys
+  //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+
+  skc_ttck_t ttck = ttck_keys[ttck_idx];
+
+#else
+
+  uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK;
+  uint const ttck_lane = ttck_idx &  SKC_RENDER_SUBGROUP_MASK;
+  skc_ttck_t ttck_s    = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)];
+
+#endif
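+
+  //
+  // note: with coalesced loads each lane appears to hold one of the
+  // next SKC_RENDER_SUBGROUP_SIZE keys at this subgroup-aligned base;
+  // keys are then broadcast to the whole subgroup one at a time as
+  // ttck_idx advances (see the refresh in the scatter loop below)
+  //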
+
+  //
+  // set up style group/layer state
+  //
+  struct skc_styling_group {
+    union skc_group_range range;
+    skc_uint              depth;
+    skc_uint              id;
+  } group;
+
+  group.range.lo = 0;
+  group.range.hi = SKC_UINT_MAX;
+  group.depth    = 0;
+  group.id       = SKC_UINT_MAX;
+
+  //
+  // start with clear tile opacity, knockout and flag bits
+  //
+  // uint color_acc_opacity  = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32
+  //
+  skc_uint flags = 0;
+
+  //
+  // declare and initialize accumulators
+  //
+#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 )
+  __local union skc_subgroup_smem                      smem[1];
+#else
+  __local union skc_subgroup_smem                      smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS];
+  __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id();
+#endif
+
+#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+  //
+  // select the initial ttck key
+  //
+  skc_ttck_t ttck;
+#if 0
+  ttck    = sub_group_broadcast(ttck_s,ttck_lane);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+  ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND
+  ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane);
+#endif
+
+#endif
+
+  //
+  // save the first key so we know what tile we're in
+  //
+  skc_ttck_t ttck0 = ttck;
+
+  //
+  // evaluate the coarse clip as late as possible
+  //
+  skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi);
+
+  if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x))
+    return;
+
+  skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi);
+
+  if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y))
+    return;
+
+#if 0
+  printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y);
+#endif
+
+  //
+  // load -> scatter -> flush
+  //
+  while (true)
+    {
+      // if scattering is disabled then just run through ttck keys
+      bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0;
+
+      // need to clear accumulators before a scatter loop
+      if (is_scatter_enabled)
+        {
+          skc_tile_aa_zero(smem);
+        }
+
+      do {
+        // skip scattering?
+        if (is_scatter_enabled)
+          {
+            skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo);
+
+            if (skc_ttck_lo_is_prefix(ttck.lo)) {
+              skc_scatter_ttpb(ttxb_extent,smem,xb_id);
+            } else {
+              skc_scatter_ttsb(ttxb_extent,smem,xb_id);
+            }
+          }
+
+        //
+        // any ttck keys left?
+        //
+        if (++ttck_idx >= ttck_count)
+          {
+            flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+            break;
+          }
+
+        //
+        // process next ttck key
+        //
+#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK
+        //
+        // SIMD -- read next key
+        //
+        ttck = ttck_keys[ttck_idx];
+#else
+        //
+        // SIMT -- refresh the ttck_s?
+        //
+        uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK;
+
+        if (ttck_lane_next == 0)
+          ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)];
+
+        //
+        // broadcast next key to entire subgroup
+        //
+#if 0
+        ttck    = sub_group_broadcast(ttck_s,ttck_lane_next);    // SHOULD WORK BUT .4454 COMPILER IS BROKEN
+#else
+        ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND
+        ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next);
+#endif
+#endif
+        // continue scattering if on same YXL layer
+      } while (skc_ttck_equal_yxl(ttck0,ttck));
+
+      // finalize if no longer on same YX tile
+      if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi))
+        {
+          // unwind the tile styling and exit
+          flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE;
+        }
+
+      //
+      // given: new layer id from ttxk key
+      //
+      // load [layer id]{ group id, depth }
+      //
+      // if within current group's layer range
+      //
+      //   if at same depth
+      //
+      //     load and execute cover>[mask>]color>blend commands
+      //
+      //   else if not at same depth then move deeper
+      //
+      //     for all groups in group trail from cur depth to new depth
+      //       enter group, saving and initializing regs as necessary
+      //     increment depth and update layer range
+      //     load and execute cover>[mask>]color>blend commands
+      //
+      // else not within layer range
+      //
+      //   exit current group, restoring regs as necessary
+      //   decrement depth and update layer range
+      //
+      //
+      skc_layer_id         const layer_id_new   = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi
+      union skc_layer_node const layer_node_new = layers[layer_id_new];
+
+      // clear flag that controls group/layer traversal
+      flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE;
+
+      do {
+        bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0;
+
+        //
+        // is layer a child of the current parent group?
+        //
+        uint cmd_next = 0;
+
+        if (!unwind && (layer_node_new.parent == group.id))
+          {
+            // execute this layer's cmds
+            cmd_next = layer_node_new.cmds;
+
+            // if this is final then configure so groups get unwound, otherwise we're done
+            flags   |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) ? SKC_TILE_FLAGS_FLUSH_UNWIND : SKC_TILE_FLAGS_FLUSH_COMPLETE);
+          }
+        else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi))
+          {
+            //
+            // is layer in a child group?
+            //
+            union skc_group_parents const gp = groups[layer_node_new.parent].parents;
+            uint                    const gn = gp.depth - ++group.depth;
+
+            if (gn == 0)
+              group.id = layer_node_new.parent;
+            else
+              group.id = commands[gp.base + gn - 1].parent;
+
+            // update group layer range
+            group.range = groups[group.id].range;
+
+            // enter current group
+            cmd_next    = groups[group.id].cmds.enter;
+          }
+        else // otherwise, exit this group
+          {
+            // leave the current group
+            cmd_next = groups[group.id].cmds.leave;
+
+            // decrement group depth
+            if (--group.depth == 0)
+              {
+                flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE;
+              }
+            else
+              {
+                // get the parents base of the current group
+                uint const gnpb = groups[group.id].parents.base;
+
+                // get parent of current group
+                group.id    = commands[gnpb].parent;
+
+                // update group layer range
+                group.range = groups[group.id].range;
+              }
+          }
+
+        //
+        // execute cmds
+        //
+        while (true)
+          {
+            union skc_styling_cmd const cmd = commands[cmd_next++];
+
+            switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE)
+              {
+              case SKC_STYLING_OPCODE_NOOP:
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_NONZERO:
+                skc_tile_cover_nonzero(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_EVENODD:
+                skc_tile_cover_evenodd(smem,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACCUMULATE:
+                skc_tile_cover_accumulate(&cover_acc,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK:
+                skc_tile_cover_wip_mask(&cover_wip,&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_ZERO:
+                skc_tile_cover_wip_zero(&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_ZERO:
+                skc_tile_cover_acc_zero(&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ZERO:
+                skc_tile_cover_msk_zero(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_ONE:
+                skc_tile_cover_msk_one(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_MASK_INVERT:
+                skc_tile_cover_msk_invert(&cover_msk);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_SOLID:
+                skc_tile_color_fill_solid(commands,&cmd_next,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR:
+                //
+                // FIXME -- gradients shouldn't be executing so much
+                // conditionally driven code at runtime since the host
+                // *knows* the gradient style and can simply create a
+                // new styling command to exploit this.
+                //
+                // FIXME -- it might be time to try using the GPU's
+                // sampler on a linear array of half4 vectors -- it
+                // might outperform the explicit load/lerp routines.
+                //
+                // FIXME -- optimizing for vertical gradients (uhhh,
+                // they're actually horizontal due to the -90 degree
+                // view transform) is nice but is it worthwhile to
+                // have this in the kernel?  Easy to add it back...
+                //
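+                //
+                // (a sketch of that sampler idea, not implemented
+                //  here: upload the stops as a small 1D image and let
+                //  a CLK_FILTER_LINEAR sampler do the stop lerp, e.g.
+                //  read_imagef(grad_img,grad_sampler,stoplerp + 0.5f)
+                //  where grad_img/grad_sampler are hypothetical)
+                //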
+#if defined( SKC_ARCH_GEN9 )
+                // disable gradients due to excessive spillage -- fix later
+                cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32);
+#else
+                skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_WIP_ZERO:
+                skc_tile_color_wip_zero(&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_ZERO:
+                skc_tile_color_acc_zero(&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_OVER:
+                skc_tile_blend_over(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_PLUS:
+                skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_MULTIPLY:
+                skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_BLEND_KNOCKOUT:
+                skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip);
+                break;
+
+              case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK:
+                // skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_BACKGROUND_OVER:
+                skc_tile_background_over(commands,&cmd_next,&color_acc);
+                break;
+
+              case SKC_STYLING_OPCODE_SURFACE_COMPOSITE:
+#ifdef SKC_SURFACE_IS_BUFFER
+                skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi);
+#else
+                skc_surface_composite_u8_rgba(surface,              &color_acc,ttck0.hi);
+#endif
+                break;
+
+              case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY:
+                if (skc_tile_color_test_opacity(&color_acc))
+                  flags |= SKC_TILE_FLAGS_SCATTER_SKIP;
+                break;
+
+              default:
+                return; // this is an illegal opcode -- trap and die!
+              }
+
+            //
+            // if sign bit is set then this was final command
+            //
+            if (cmd.s32 < 0)
+              break;
+          }
+
+        // continue as long as tile flush isn't complete
+      } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0);
+
+      // return if was the final flush
+      if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE)
+        return;
+
+      // update the wip ttck
+      ttck0 = ttck;
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
index 378d51d..7f48978 100644
--- a/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttck.cl
@@ -1,130 +1,130 @@
-/*

- * Copyright 2018 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE

-// LAYOUT OF THE TTCK KEY.  IF THE TTCK KEY IS ALTERED THEN THIS

-// KERNEL WILL NEED TO BE UPDATED

-//

-

-#include "tile.h"

-#include "atomic_cl.h"

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)

-#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)

-

-//

-//

-//

-

-#define SKC_YX_NEQ(row,prev)                \

-  (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)

-

-//

-//

-//

-

-__kernel

-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))

-void

-skc_kernel_segment_ttck(__global HS_KEY_TYPE              * SKC_RESTRICT const vout,

-                        __global uint                     * SKC_RESTRICT const indices,

-                        __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)

-{

-  uint const global_id = get_global_id(0);

-  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;

-  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);

-  uint const lane_idx  = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;

-

-  //

-  // LOAD ALL THE ROWS

-  //

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                                           \

-  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];

-

-  HS_SLAB_ROWS();

-

-  //

-  // LOAD LAST REGISTER FROM COLUMN TO LEFT

-  //

-  uint  diffs = 0;

-  uint2 r0    = r1;

-

-  if (gmem_base > 0) {

-    // if this is the first key in any slab but the first then it

-    // broadcast loads the last key in previous slab

-    r0.hi = as_uint2(vout[gmem_base - 1]).hi;

-  } else if (get_sub_group_local_id() == 0) {

-    // if this is the first lane in the first slab

-    diffs = 1;

-  }

-

-  // now shuffle in the last key from the column to the left

-  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);

-

-  //

-  // FIND ALL DIFFERENCES IN SLAB

-  //

-  uint valid = 0;

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  valid |= ((r##row != SKC_ULONG_MAX) << prev);

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  diffs |= (SKC_YX_NEQ(row,prev) << prev);

-

-  HS_SLAB_ROWS();

-

-  //

-  // SUM UP THE DIFFERENCES

-  //

-  uint const valid_diffs = valid & diffs;

-  uint const count       = popcount(valid_diffs);

-  uint const inclusive   = sub_group_scan_inclusive_add(count);

-  uint const exclusive   = inclusive - count;

-

-  //

-  // RESERVE SPACE IN THE INDICES ARRAY

-  //

-  uint next = 0;

-

-  if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)

-    next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset

-

-  // distribute base across subgroup

-  next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);

-

-  //

-  // STORE THE INDICES

-  //

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (valid_diffs & (1 << prev))                \

-    indices[next++] = lane_idx + prev;

-

-  HS_SLAB_ROWS();

-

-  //

-  // TRANSPOSE THE SLAB AND STORE IT

-  //

-  HS_TRANSPOSE_SLAB();

-}

-

-//

-//

-//

+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
+// LAYOUT OF THE TTCK KEY.  IF THE TTCK KEY IS ALTERED THEN THIS
+// KERNEL WILL NEED TO BE UPDATED
+//
+
+#include "tile.h"
+#include "atomic_cl.h"
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
+#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)
+
+//
+//
+//
+
+#define SKC_YX_NEQ(row,prev)                \
+  (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
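+
+//
+// (SKC_YX_NEQ flags positions where two consecutive sorted keys land
+//  on different tiles -- those are exactly the indices recorded below
+//  as the start of a new tile's run of ttck keys)
+//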
+
+//
+//
+//
+
+__kernel
+__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+void
+skc_kernel_segment_ttck(__global HS_KEY_TYPE              * SKC_RESTRICT const vout,
+                        __global uint                     * SKC_RESTRICT const indices,
+                        __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
+{
+  uint const global_id = get_global_id(0);
+  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
+  uint const lane_idx  = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+
+  //
+  // LOAD ALL THE ROWS
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                                           \
+  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
+
+  HS_SLAB_ROWS();
+
+  //
+  // LOAD LAST REGISTER FROM COLUMN TO LEFT
+  //
+  uint  diffs = 0;
+  uint2 r0    = r1;
+
+  if (gmem_base > 0) {
+    // if this is the first key in any slab but the first then it
+    // broadcast loads the last key in previous slab
+    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
+  } else if (get_sub_group_local_id() == 0) {
+    // if this is the first lane in the first slab
+    diffs = 1;
+  }
+
+  // now shuffle in the last key from the column to the left
+  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
+
+  //
+  // FIND ALL DIFFERENCES IN SLAB
+  //
+  uint valid = 0;
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  valid |= ((r##row != SKC_ULONG_MAX) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  diffs |= (SKC_YX_NEQ(row,prev) << prev);
+
+  HS_SLAB_ROWS();
+
+  //
+  // SUM UP THE DIFFERENCES
+  //
+  uint const valid_diffs = valid & diffs;
+  uint const count       = popcount(valid_diffs);
+  uint const inclusive   = sub_group_scan_inclusive_add(count);
+  uint const exclusive   = inclusive - count;
+
+  //
+  // RESERVE SPACE IN THE INDICES ARRAY
+  //
+  uint next = 0;
+
+  if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
+    next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
+
+  // distribute base across subgroup
+  next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
+
+  //
+  // STORE THE INDICES
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (valid_diffs & (1 << prev))                \
+    indices[next++] = lane_idx + prev;
+
+  HS_SLAB_ROWS();
+
+  //
+  // TRANSPOSE THE SLAB AND STORE IT
+  //
+  HS_TRANSPOSE_SLAB();
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
index e9accde..9db82d5 100644
--- a/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
+++ b/src/compute/skc/platforms/cl_12/kernels/segment_ttrk.cl
@@ -1,394 +1,394 @@
-/*

- * Copyright 2018 Google Inc.

- *

- * Use of this source code is governed by a BSD-style license that can

- * be found in the LICENSE file.

- *

- */

-

-//

-// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE

-// LAYOUT OF THE TTRK KEY.  IF THE TTRK KEY IS ALTERED THEN THIS

-// KERNEL WILL NEED TO BE UPDATED

-//

-

-#include "tile.h"

-#include "raster_builder_cl_12.h" // need meta_in structure

-#include "device_cl_12.h"

-

-//

-//

-//

-

-#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)

-#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)

-

-//

-// THE BEST TYPE TO ZERO SMEM

-//

-

-#define SKC_ZERO_TYPE  ulong

-#define SKC_ZERO_WORDS 2

-

-//

-// THE ORDER OF COMPONENTS IS:

-//

-// 0: blocks

-// 1: offset

-// 2: pk

-// 3: rk

-//

-

-#if (HS_KEYS_PER_SLAB < 256)

-

-#define SKC_META_TYPE       uint

-#define SKC_META_WORDS      1

-

-#define SKC_COMPONENT_TYPE  uchar

-

-#else

-

-#define SKC_META_TYPE       uint2

-#define SKC_META_WORDS      2

-

-#define SKC_COMPONENT_TYPE  ushort

-

-#endif

-

-//

-//

-//

-

-#if ( SKC_TTRK_HI_BITS_COHORT <= 8)

-#define SKC_COHORT_TYPE uchar

-#else

-#define SKC_COHORT_TYPE ushort

-#endif

-

-//

-//

-//

-

-#define SKC_COHORT_ID(row)                      \

-  as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT

-

-//

-// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED

-//

-

-#define SKC_IS_BLOCK(row)                                               \

-  ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)

-

-#define SKC_YX(row,prev)                        \

-  (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)

-

-#define SKC_IS_PK(row,prev)                             \

-  ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)

-

-//

-// COHORT   SIZE IS ALWAYS A POWER-OF-TWO

-// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO

-//

-// COHORT SIZE >= SUBGROUP SIZE

-//

-

-#define SKC_COHORT_SIZE           (1<<SKC_TTRK_HI_BITS_COHORT)

-

-#define SKC_ZERO_RATIO            (SKC_ZERO_WORDS / SKC_META_WORDS)

-#define SKC_META_ZERO_COUNT       (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))

-#define SKC_META_ZERO_REM         (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))

-

-#define SKC_META_COMPONENTS       4

-#define SKC_META_COMPONENT_COUNT  (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))

-

-//

-//

-//

-

-__kernel

-__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))

-void

-skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,

-                        __global uint        * SKC_RESTRICT const metas)

-{

-  __local union

-  {

-    SKC_META_TYPE volatile m[SKC_COHORT_SIZE];

-    SKC_ZERO_TYPE          z[SKC_META_ZERO_COUNT];

-    SKC_COMPONENT_TYPE     c[SKC_META_COMPONENT_COUNT];

-  } shared;

-

-  uint const global_id = get_global_id(0);

-  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;

-  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);

-  uint const gmem_off  = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;

-

-  //

-  // LOAD ALL THE ROWS

-  //

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                                           \

-  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];

-

-  HS_SLAB_ROWS();

-

-  //

-  // LOAD LAST REGISTER FROM COLUMN TO LEFT

-  //

-  uint  diffs = 0;

-  uint2 r0    = 0;

-

-  if (gmem_base > 0) {

-    // if this is the first key in any slab but the first then it

-    // broadcast loads the last key in previous slab

-    r0.hi = as_uint2(vout[gmem_base - 1]).hi;

-  } else {

-    // otherwise broadcast the first key in the first slab

-    r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);

-    // and mark it as an implicit diff

-    if (get_sub_group_local_id() == 0)

-      diffs = 1;

-  }

-

-  // now shuffle in the last key from the column to the left

-  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);

-

-  // shift away y/x

-  SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;

-

-  //

-  // EXTRACT ALL COHORT IDS EARLY...

-  //

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                           \

-  SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);

-

-  HS_SLAB_ROWS();

-

-  //

-  // DEBUG

-  //

-#if 0

-  if (gmem_base == HS_KEYS_PER_SLAB * 7)

-    {

-      if (get_sub_group_local_id() == 0)

-        printf("\n%llX ",as_ulong(r0));

-      else

-        printf("%llX ",as_ulong(r0));

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-      if (get_sub_group_local_id() == 0)        \

-        printf("\n%llX ",r##row);               \

-      else                                      \

-        printf("%llX ",r##row);

-

-      HS_SLAB_ROWS();

-    }

-#endif

-

-  //

-  // CAPTURE ALL CONDITIONS WE CARE ABOUT

-  //

-  // Diffs must be captured before cohorts

-  //

-  uint            valid  = 0;

-  uint            blocks = 0;

-  uint            pks    = 0;

-  SKC_COHORT_TYPE c_max  = 0;

-

-  //

-  // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN

-  // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE

-  //

-#if 0

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  diffs |= ((c##row != c##prev) << prev);

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  blocks |= (SKC_IS_BLOCK(row) << prev);

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  pks |= SKC_IS_PK(row,prev) << prev);

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  valid |= ((r##row != SKC_ULONG_MAX) << prev);

-

-  HS_SLAB_ROWS();

-

-#else

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (c##row != c##prev)                        \

-    diffs |= 1<<prev;

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (SKC_IS_BLOCK(row))                        \

-    blocks |= 1<<prev;

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (SKC_IS_PK(row,prev))                      \

-    pks |= 1<<prev;

-

-  HS_SLAB_ROWS();

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (r##row != SKC_ULONG_MAX) {                \

-    valid |= 1<<prev;                           \

-    c_max  = max(c_max,c##row);                 \

-  }

-

-  HS_SLAB_ROWS();

-

-#endif

-

-  //

-  // TRANSPOSE THE SLAB AND STORE IT

-  //

-  HS_TRANSPOSE_SLAB();

-

-  // the min cohort is the first key in the slab

-  uint const c_min = sub_group_broadcast(c1,0);

-  

-  // the max cohort is the max across all lanes

-  c_max = sub_group_reduce_max(c_max);

-

-#if 0 // REMOVE ME LATER

-  if (get_sub_group_local_id() == 0)

-    printf("%3u : ( %3u , %3u )\n",

-           get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);

-#endif

-

-  //

-  // ZERO SMEM

-  //

-  // zero only the meta info for the cohort ids found in this slab

-  //

-#if   (SKC_ZERO_WORDS >= SKC_META_WORDS)

-  uint       zz     = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();

-  uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;

-

-  for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)

-    shared.z[zz] = 0;

-#else

-  // ERROR -- it's highly unlikely that the zero type is smaller than

-  // the meta type

-#error("Unsupported right now...")

-#endif

-

-  //

-  // ACCUMULATE AND STORE META INFO

-  //

-  uint const    valid_blocks = valid & blocks;

-  uint const    valid_pks    = valid & pks & ~diffs;

-  SKC_META_TYPE meta         = ( 0 );

-

-#define SKC_META_LOCAL_ADD(meta)                \

-  atomic_add(shared.m+HS_REG_LAST(c),meta);

-

-#define SKC_META_LOCAL_STORE(meta,prev)         \

-  shared.m[c##prev] = meta;

-

-  // note this is purposefully off by +1

-#define SKC_META_RESET(meta,curr)               \

-  meta = ((gmem_off + curr) << 8);

-

-#if 0

-

-  // FIXME -- this can be tweaked to shift directly

-#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \

-  meta += ((((blocks >> prev) & 1)      ) |     \

-           (((pks    >> prev) & 1) << 16) |     \

-           (((rks    >> prev) & 1) << 24));

-

-#else

-

-#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \

-  if (blocks & (1<<prev))                       \

-    meta += 1;                                  \

-  if (pks    & (1<<prev))                       \

-    meta += 1<<16;                              \

-  if (rks    & (1<<prev))                       \

-    meta += 1<<24;

-

-#endif

-

-#undef  HS_SLAB_ROW

-#define HS_SLAB_ROW(row,prev)                   \

-  if (diffs & (1<<prev)) {                      \

-    SKC_META_LOCAL_STORE(meta,prev);            \

-    SKC_META_RESET(meta,row);                   \

-  }                                             \

-  SKC_META_ADD(meta,prev,                       \

-               valid_blocks,                    \

-               valid_pks,                       \

-               valid);

-

-  HS_SLAB_ROWS();

-

-  //

-  // ATOMICALLY ADD THE CARRIED OUT METAS

-  //

-#if 0 // BUG

-  if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))

-    SKC_META_LOCAL_ADD(meta);

-#else

-  if (meta != 0)

-    SKC_META_LOCAL_ADD(meta);

-#endif

-

-  //

-  // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE

-  //

-

-  // convert the slab offset to an extent offset

-  bool const is_offset = (get_sub_group_local_id() & 3) == 1;

-  uint const adjust    = is_offset ? gmem_base - 1 : 0;

-

-  //

-  // only process the meta components found in this slab

-  //

-  uint const cc_min = c_min * SKC_META_COMPONENTS;

-  uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;

-  uint       cc     = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();

-

-  if ((cc >= cc_min) && (cc <= cc_max))

-    {

-      uint const c = shared.c[cc];

-

-      if (c != 0)

-        atomic_add(metas+cc,c+adjust);

-    }

-

-  cc += HS_LANES_PER_WARP;

-

-  for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)

-    {

-      uint const c = shared.c[cc];

-

-      if (c != 0)

-        atomic_add(metas+cc,c+adjust);

-    }

-}

-

-//

-//

-//

+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+//
+// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
+// LAYOUT OF THE TTRK KEY.  IF THE TTRK KEY IS ALTERED THEN THIS
+// KERNEL WILL NEED TO BE UPDATED
+//
+
+#include "tile.h"
+#include "raster_builder_cl_12.h" // need meta_in structure
+#include "kernel_cl_12.h"
+
+//
+//
+//
+
+#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
+#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)
+
+//
+// THE BEST TYPE TO ZERO SMEM
+//
+
+#define SKC_ZERO_TYPE  ulong
+#define SKC_ZERO_WORDS 2
+
+//
+// THE ORDER OF COMPONENTS IS:
+//
+// 0: blocks
+// 1: offset
+// 2: pk
+// 3: rk
+//
+
+#if (HS_KEYS_PER_SLAB < 256)
+
+#define SKC_META_TYPE       uint
+#define SKC_META_WORDS      1
+
+#define SKC_COMPONENT_TYPE  uchar
+
+#else
+
+#define SKC_META_TYPE       uint2
+#define SKC_META_WORDS      2
+
+#define SKC_COMPONENT_TYPE  ushort
+
+#endif
+
+//
+//
+//
+
+#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
+#define SKC_COHORT_TYPE uchar
+#else
+#define SKC_COHORT_TYPE ushort
+#endif
+
+//
+//
+//
+
+#define SKC_COHORT_ID(row)                      \
+  as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
+
+//
+// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
+//
+
+#define SKC_IS_BLOCK(row)                                               \
+  ((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
+
+#define SKC_YX(row,prev)                        \
+  (as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
+
+#define SKC_IS_PK(row,prev)                             \
+  ((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
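+
+// i.e. the adjacent keys differ, but only within the X bits, so they
+// share the same cohort and Y -- assuming X occupies the low bits of
+// the hi word, per the FIXME above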
+
+//
+// COHORT   SIZE IS ALWAYS A POWER-OF-TWO
+// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
+//
+// COHORT SIZE >= SUBGROUP SIZE
+//
+
+#define SKC_COHORT_SIZE           (1<<SKC_TTRK_HI_BITS_COHORT)
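+// e.g. with SKC_TTRK_HI_BITS_COHORT = 8 the cohort id fits the uchar
+// chosen above and SKC_COHORT_SIZE is 256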
+
+#define SKC_ZERO_RATIO            (SKC_ZERO_WORDS / SKC_META_WORDS)
+#define SKC_META_ZERO_COUNT       (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
+#define SKC_META_ZERO_REM         (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
+
+#define SKC_META_COMPONENTS       4
+#define SKC_META_COMPONENT_COUNT  (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
+
+//
+//
+//
+
+__kernel
+__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
+void
+skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
+                        __global uint        * SKC_RESTRICT const metas)
+{
+  __local union
+  {
+    SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
+    SKC_ZERO_TYPE          z[SKC_META_ZERO_COUNT];
+    SKC_COMPONENT_TYPE     c[SKC_META_COMPONENT_COUNT];
+  } shared;
+
+  uint const global_id = get_global_id(0);
+  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
+  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
+  uint const gmem_off  = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
+
+  //
+  // LOAD ALL THE ROWS
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                                           \
+  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
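+
+// HS_SLAB_ROWS() expands the macro above once per key in the lane --
+// for a 16-key lane this declares r1..r16, each HS_LANES_PER_WARP
+// keys apart in the slab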
+
+  HS_SLAB_ROWS();
+
+  //
+  // LOAD LAST REGISTER FROM COLUMN TO LEFT
+  //
+  uint  diffs = 0;
+  uint2 r0    = 0;
+
+  if (gmem_base > 0) {
+    // for any slab but the first, broadcast load the last key of the
+    // previous slab
+    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
+  } else {
+    // otherwise broadcast the first key in the first slab
+    r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
+    // and mark it as an implicit diff
+    if (get_sub_group_local_id() == 0)
+      diffs = 1;
+  }
+
+  // now shuffle in the last key from the column to the left
+  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
+
+  // shift away y/x
+  SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
+
+  //
+  // EXTRACT ALL COHORT IDS EARLY...
+  //
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                           \
+  SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
+
+  HS_SLAB_ROWS();
+
+  //
+  // DEBUG
+  //
+#if 0
+  if (gmem_base == HS_KEYS_PER_SLAB * 7)
+    {
+      if (get_sub_group_local_id() == 0)
+        printf("\n%llX ",as_ulong(r0));
+      else
+        printf("%llX ",as_ulong(r0));
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+      if (get_sub_group_local_id() == 0)        \
+        printf("\n%llX ",r##row);               \
+      else                                      \
+        printf("%llX ",r##row);
+
+      HS_SLAB_ROWS();
+    }
+#endif
+
+  //
+  // CAPTURE ALL CONDITIONS WE CARE ABOUT
+  //
+  // Diffs must be captured before cohorts
+  //
+  uint            valid  = 0;
+  uint            blocks = 0;
+  uint            pks    = 0;
+  SKC_COHORT_TYPE c_max  = 0;
+
+  //
+  // FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
+  // EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
+  //
+#if 0
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  diffs |= ((c##row != c##prev) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  blocks |= (SKC_IS_BLOCK(row) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  pks |= (SKC_IS_PK(row,prev) << prev);
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  valid |= ((r##row != SKC_ULONG_MAX) << prev);
+
+  HS_SLAB_ROWS();
+
+#else
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (c##row != c##prev)                        \
+    diffs |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (SKC_IS_BLOCK(row))                        \
+    blocks |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (SKC_IS_PK(row,prev))                      \
+    pks |= 1<<prev;
+
+  HS_SLAB_ROWS();
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (r##row != SKC_ULONG_MAX) {                \
+    valid |= 1<<prev;                           \
+    c_max  = max(c_max,c##row);                 \
+  }
+
+  HS_SLAB_ROWS();
+
+#endif
+
+  //
+  // TRANSPOSE THE SLAB AND STORE IT
+  //
+  HS_TRANSPOSE_SLAB();
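+
+  // (HS_TRANSPOSE_SLAB() presumably stores the slab back to vout in
+  //  sorted linear order, mirroring the standalone transpose kernel)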
+
+  // the min cohort is the cohort of the first key in the slab
+  uint const c_min = sub_group_broadcast(c1,0);
+
+  // the max cohort is the max across all lanes
+  c_max = sub_group_reduce_max(c_max);
+
+#if 0 // REMOVE ME LATER
+  if (get_sub_group_local_id() == 0)
+    printf("%3u : ( %3u , %3u )\n",
+           get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
+#endif
+
+  //
+  // ZERO SMEM
+  //
+  // zero only the meta info for the cohort ids found in this slab
+  //
+#if   (SKC_ZERO_WORDS >= SKC_META_WORDS)
+  uint       zz     = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
+  uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
+
+  for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
+    shared.z[zz] = 0;
+#else
+  // ERROR -- it's highly unlikely that the zero type is smaller than
+  // the meta type
+#error("Unsupported right now...")
+#endif
+
+  //
+  // ACCUMULATE AND STORE META INFO
+  //
+  uint const    valid_blocks = valid & blocks;
+  uint const    valid_pks    = valid & pks & ~diffs;
+  SKC_META_TYPE meta         = ( 0 );
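+
+  // note: valid_pks keeps a pk bit only where the key is valid and its
+  // diff bit is clear, i.e. it did not start a new cohort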
+
+#define SKC_META_LOCAL_ADD(meta)                \
+  atomic_add(shared.m+HS_REG_LAST(c),meta);
+
+#define SKC_META_LOCAL_STORE(meta,prev)         \
+  shared.m[c##prev] = meta;
+
+  // note this is purposefully off by +1
+#define SKC_META_RESET(meta,curr)               \
+  meta = ((gmem_off + curr) << 8);
+
+#if 0
+
+  // FIXME -- this can be tweaked to shift directly
+#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
+  meta += ((((blocks >> prev) & 1)      ) |     \
+           (((pks    >> prev) & 1) << 16) |     \
+           (((rks    >> prev) & 1) << 24));
+
+#else
+
+#define SKC_META_ADD(meta,prev,blocks,pks,rks)  \
+  if (blocks & (1<<prev))                       \
+    meta += 1;                                  \
+  if (pks    & (1<<prev))                       \
+    meta += 1<<16;                              \
+  if (rks    & (1<<prev))                       \
+    meta += 1<<24;
+
+#endif
+
+#undef  HS_SLAB_ROW
+#define HS_SLAB_ROW(row,prev)                   \
+  if (diffs & (1<<prev)) {                      \
+    SKC_META_LOCAL_STORE(meta,prev);            \
+    SKC_META_RESET(meta,row);                   \
+  }                                             \
+  SKC_META_ADD(meta,prev,                       \
+               valid_blocks,                    \
+               valid_pks,                       \
+               valid);
+
+  HS_SLAB_ROWS();
+
+  //
+  // ATOMICALLY ADD THE CARRIED OUT METAS
+  //
+#if 0 // BUG
+  if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
+    SKC_META_LOCAL_ADD(meta);
+#else
+  if (meta != 0)
+    SKC_META_LOCAL_ADD(meta);
+#endif
+
+  //
+  // NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
+  //
+
+  // convert the slab offset to an extent offset
+  bool const is_offset = (get_sub_group_local_id() & 3) == 1;
+  uint const adjust    = is_offset ? gmem_base - 1 : 0;
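+  // (the stored offset component is off by +1 -- see SKC_META_RESET --
+  //  so the -1 here compensates when it is added to the global table)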
+
+  //
+  // only process the meta components found in this slab
+  //
+  uint const cc_min = c_min * SKC_META_COMPONENTS;
+  uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
+  uint       cc     = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
+
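+  // cc starts at the subgroup-aligned slot at or below cc_min; the
+  // guarded pass below plus the strided loop cover [cc_min,cc_max]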
+  if ((cc >= cc_min) && (cc <= cc_max))
+    {
+      uint const c = shared.c[cc];
+
+      if (c != 0)
+        atomic_add(metas+cc,c+adjust);
+    }
+
+  cc += HS_LANES_PER_WARP;
+
+  for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
+    {
+      uint const c = shared.c[cc];
+
+      if (c != 0)
+        atomic_add(metas+cc,c+adjust);
+    }
+}
+
+//
+//
+//
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.c b/src/compute/skc/platforms/cl_12/runtime_cl.c
deleted file mode 100644
index a745ed0..0000000
--- a/src/compute/skc/platforms/cl_12/runtime_cl.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-//
-//
-//
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-
-//
-//
-//
-
-#include "runtime_cl.h"
-#include "common/cl/assert_cl.h"
-
-//
-//
-//
-
-static is_verbose = true;
-
-//
-// FIXME -- all variable length device queries need to start querying
-// the parameter's return size before getting its value
-//
-// FIXME -- this is now handled by the common/cl/find.* routine
-//
-
-union skc_cl_device_version {
-  struct {
-    cl_uchar opencl_space[7]; // "OpenCL_"
-    cl_uchar major;
-    cl_uchar dot;
-    cl_uchar minor;
-#if 1 // Intel NEO requires at least 16 bytes
-    cl_uchar space;
-    cl_uchar vendor[32];
-#endif
-  };
-  struct {
-    cl_uchar aN[];
-  };
-};
-
-typedef cl_bitfield cl_diagnostic_verbose_level_intel;
-
-#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL           0x4106
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL      0x2
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL     0x1
-#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL  0x4
-
-static
-void 
-CL_CALLBACK 
-skc_context_callback(char const * error, void const * info, size_t size, void * user)
-{
-  if (info != NULL )
-    {
-      fprintf(stderr,"%s\n",error);
-    }
-}
-
-//
-//
-//
-
-skc_err
-skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
-                      char const            * const target_platform_substring,
-                      char const            * const target_device_substring,
-                      cl_context_properties         context_properties[])
-{
-  skc_err err = SKC_ERR_SUCCESS;
-  
-  //
-  // search available devices for a match
-  //
-#define PLATFORM_IDS_MAX         16
-#define DEVICE_IDS_MAX           16
-#define PLATFORM_NAME_SIZE_MAX   64
-#define DEVICE_NAME_SIZE_MAX     64
-#define DRIVER_VERSION_SIZE_MAX  64
-
-  cl_int         cl_err;
-
-  cl_platform_id platform_ids[PLATFORM_IDS_MAX];
-  cl_device_id   device_ids  [PLATFORM_IDS_MAX][DEVICE_IDS_MAX];
-
-  cl_uint        platform_count;
-  cl_uint        device_count[PLATFORM_IDS_MAX];
-  
-  cl_uint        platform_idx = UINT32_MAX, device_idx = UINT32_MAX;
-
-  bool           match = false; // find _first_ match
-
-  //
-  // get number of platforms
-  //
-  cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count));
-
-  //
-  // search platforms
-  //
-  for (cl_uint ii=0; ii<platform_count; ii++)
-    {
-      char platform_name[PLATFORM_NAME_SIZE_MAX];
-
-      cl(GetPlatformInfo(platform_ids[ii],
-                         CL_PLATFORM_NAME,
-                         sizeof(platform_name),
-                         platform_name,
-                         NULL));
-
-      if (!match && (strstr(platform_name,target_platform_substring) != NULL)) 
-        {
-          platform_idx = ii;
-        }
-
-      if (is_verbose) {
-        fprintf(stdout,"%2u: %s\n",ii,platform_name);
-      }
-
-      cl_err = clGetDeviceIDs(platform_ids[ii],
-                              CL_DEVICE_TYPE_ALL,
-                              DEVICE_IDS_MAX,
-                              device_ids[ii],
-                              device_count+ii);
-
-      if (cl_err != CL_DEVICE_NOT_FOUND)
-        cl_ok(cl_err);
-
-      for (cl_uint jj=0; jj<device_count[ii]; jj++)
-        {
-          char                        device_name[DEVICE_NAME_SIZE_MAX];
-          union skc_cl_device_version device_version;
-          cl_uint                     device_align_bits;
-          char                        driver_version[DRIVER_VERSION_SIZE_MAX];
-
-          cl(GetDeviceInfo(device_ids[ii][jj],
-                           CL_DEVICE_NAME,
-                           sizeof(device_name),
-                           device_name,
-                           NULL));
-
-          // FIXME -- some of these variable length parameters should
-          // use the "size the param before reading" idiom
-          cl(GetDeviceInfo(device_ids[ii][jj],
-                           CL_DEVICE_VERSION,
-                           sizeof(device_version),
-                           device_version.aN,
-                           NULL));
-
-          cl(GetDeviceInfo(device_ids[ii][jj],
-                           CL_DEVICE_MEM_BASE_ADDR_ALIGN,
-                           sizeof(device_align_bits),
-                           &device_align_bits,
-                           NULL));
-          
-          cl_uint const base_align = device_align_bits / 8; // bytes
-
-          cl(GetDeviceInfo(device_ids[ii][jj],
-                           CL_DRIVER_VERSION,
-                           sizeof(driver_version),
-                           driver_version,
-                           NULL));
-          
-          if (!match && (platform_idx == ii) && (strstr(device_name,target_device_substring) != NULL))
-            {
-              match      = true;
-              device_idx = jj;
-
-              runtime_cl->version.major = device_version.major - 48;
-              runtime_cl->version.minor = device_version.minor - 48;
-              runtime_cl->base_align    = base_align;
-
-              if (is_verbose) {
-                fprintf(stdout," >>>");
-              }
-            }
-          else if (is_verbose) 
-            {
-              fprintf(stdout,"    ");
-            }
-
-          if (is_verbose) {
-            fprintf(stdout,
-                    " %1u: %s [ %s ] [ %s ] [ %u ]\n",
-                    jj,
-                    device_name,
-                    device_version.aN,
-                    driver_version,
-                    base_align);
-          }
-        }
-    }
-
-  if (is_verbose) {
-    fprintf(stdout,"\n");
-  }
-
-  //
-  // get target platform and device
-  //
-  if (platform_idx >= platform_count)
-    {
-      fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring);
-      exit(EXIT_FAILURE);
-    }
-  if (device_idx >= device_count[platform_idx])
-    {
-      fprintf(stderr,"no match for target device substring %s\n",target_device_substring);
-      exit(EXIT_FAILURE);
-    }
-
-  runtime_cl->platform_id = platform_ids[platform_idx];
-  runtime_cl->device_id   = device_ids  [platform_idx][device_idx];
-
-  //
-  // create context
-  //
-
-#if 0
-  cl_context_properties context_properties[] = 
-    { 
-      CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id,
-      0 
-    };
-#else
-  context_properties[1] = (cl_context_properties)runtime_cl->platform_id;
-#endif
-
-  runtime_cl->context = clCreateContext(context_properties,
-                                    1,
-                                    &runtime_cl->device_id,
-                                    skc_context_callback,
-                                    NULL,
-                                    &cl_err);
-  cl_ok(cl_err);
-
-  //
-  // get device name, driver version, and unified memory flag
-  //
-  if (is_verbose)
-    {
-      char                       device_name[DEVICE_NAME_SIZE_MAX];
-      char                       driver_version[DRIVER_VERSION_SIZE_MAX];
-      cl_bool                    device_is_unified; 
-      cl_device_svm_capabilities svm_caps;
-      size_t                     printf_buffer_size;
-
-      cl(GetDeviceInfo(runtime_cl->device_id,
-                       CL_DEVICE_NAME,
-                       sizeof(device_name),
-                       device_name,
-                       NULL));
-
-      cl(GetDeviceInfo(runtime_cl->device_id,
-                       CL_DRIVER_VERSION,
-                       sizeof(driver_version),
-                       driver_version,
-                       NULL));
-
-      cl(GetDeviceInfo(runtime_cl->device_id,
-                       CL_DEVICE_HOST_UNIFIED_MEMORY,
-                       sizeof(device_is_unified),
-                       &device_is_unified,
-                       NULL));
-
-      cl(GetDeviceInfo(runtime_cl->device_id,
-                       CL_DEVICE_SVM_CAPABILITIES,
-                       sizeof(svm_caps),
-                       &svm_caps,
-                       0));
-
-      cl(GetDeviceInfo(runtime_cl->device_id,
-                       CL_DEVICE_PRINTF_BUFFER_SIZE,
-                       sizeof(printf_buffer_size),
-                       &printf_buffer_size,
-                       NULL));
-
-      fprintf(stderr,
-              "CL_DEVICE_SVM_COARSE_GRAIN_BUFFER  %c\n"
-              "CL_DEVICE_SVM_FINE_GRAIN_BUFFER    %c\n"
-              "CL_DEVICE_SVM_FINE_GRAIN_SYSTEM    %c\n"
-              "CL_DEVICE_SVM_ATOMICS              %c\n"
-              "CL_DEVICE_PRINTF_BUFFER_SIZE       %zu\n\n",
-              svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-',
-              svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER   ? '*' : '-',
-              svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM   ? '*' : '-',
-              svm_caps & CL_DEVICE_SVM_ATOMICS             ? '*' : '-',
-              printf_buffer_size);
-    }
-
-  return err;
-}
-
-//
-//
-//
-
-skc_err
-skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl)
-{
-  // FIXME
-  printf("%s incomplete!\n",__func__);
-
-  return SKC_ERR_SUCCESS;
-}
-
-//
-//
-//
-
-cl_command_queue
-skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type)
-{
-  cl_command_queue cq;
-
-  if (runtime_cl->version.major < 2)
-    {
-      //
-      // <= OpenCL 1.2
-      //
-      cl_int cl_err;
-
-      cq = clCreateCommandQueue(runtime_cl->context,
-                                runtime_cl->device_id,
-                                (cl_command_queue_properties)type,
-                                &cl_err); cl_ok(cl_err);  
-    }
-  else
-    {
-      //
-      // >= OpenCL 2.0
-      //
-      cl_int                    cl_err;
-      cl_queue_properties const queue_properties[] = {
-        CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
-      };
-
-      cq = clCreateCommandQueueWithProperties(runtime_cl->context,
-                                              runtime_cl->device_id,
-                                              queue_properties,
-                                              &cl_err); cl_ok(cl_err);
-    }
-
-  return cq;
-}
-
-//
-//
-//
-
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl.h b/src/compute/skc/platforms/cl_12/runtime_cl.h
deleted file mode 100644
index 9e58ca0..0000000
--- a/src/compute/skc/platforms/cl_12/runtime_cl.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright 2017 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can
- * be found in the LICENSE file.
- *
- */
-
-#pragma once
-
-//
-// squelch OpenCL 1.2 deprecation warning
-//
-
-#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#endif
-
-#include <CL/opencl.h>
-
-//
-//
-//
-
-#include "skc.h"
-
-//
-// Minimal OpenCL state needed by the runtime to get started
-//
-
-struct skc_runtime_cl
-{
-  cl_platform_id platform_id;
-  cl_device_id   device_id;
-  cl_context     context;
-  
-  struct {
-    cl_uint      major;
-    cl_uint      minor;
-  } version; // sometimes we need to know this at runtime 
-
-  cl_uint        base_align; // base address alignment for subbuffer origins
-};
-
-//
-//
-//
-
-typedef enum skc_cq_type_e {
-  SKC_CQ_TYPE_IN_ORDER               = 0,
-  SKC_CQ_TYPE_OUT_OF_ORDER           = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-  SKC_CQ_TYPE_IN_ORDER_PROFILING     = (SKC_CQ_TYPE_IN_ORDER     | CL_QUEUE_PROFILING_ENABLE),
-  SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE),
-} skc_cq_type_e;
-
-//
-// safely creates a generic OpenCL target in very few lines
-//
-
-skc_err
-skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
-                      char const            * const target_platform_substring,
-                      char const            * const target_device_substring,
-                      cl_context_properties         context_properties[]);
-
-skc_err
-skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl);
-
-//
-// create a command queue with the non-deprecated function
-//
-
-cl_command_queue
-skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type);
-
-//
-//
-//
-
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.c b/src/compute/skc/platforms/cl_12/runtime_cl_12.c
index fca13ed..a4a578f 100644
--- a/src/compute/skc/platforms/cl_12/runtime_cl_12.c
+++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.c
@@ -24,7 +24,6 @@
 #include "grid.h"
 #include "common/cl/assert_cl.h"
 #include "config_cl.h"
-#include "runtime_cl.h"
 #include "runtime_cl_12.h"
 #include "export_cl_12.h"
 
@@ -32,7 +31,7 @@
 //
 //
 
-static 
+static
 void
 skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
 {
@@ -42,7 +41,7 @@
   // create block extent
   skc_extent_pdrw_alloc(runtime,
                         &runtime->block_pool.blocks,
-                        runtime->block_pool.size->pool_size * 
+                        runtime->block_pool.size->pool_size *
                         runtime->config->block.bytes);
 
   // allocate block pool ids
@@ -85,7 +84,7 @@
   cl(ReleaseKernel(k1));
 }
 
-static 
+static
 void
 skc_block_pool_dispose(struct skc_runtime * const runtime)
 {
@@ -106,7 +105,7 @@
 }
 
 static
-void 
+void
 skc_runtime_wait(struct skc_runtime * const runtime)
 {
   skc_scheduler_wait(runtime->scheduler);
@@ -118,18 +117,26 @@
 
 skc_err
 skc_runtime_cl_12_create(struct skc_context * const context,
-                         char const         * const target_platform_substring,
-                         char const         * const target_device_substring,
-                         cl_context_properties      context_properties[])
+                         cl_context                 context_cl,
+                         cl_device_id               device_id_cl)
 {
   // allocate the runtime
   struct skc_runtime * const runtime = malloc(sizeof(*runtime));
 
-  // acquire OpenCL ids and context for target device
-  skc_err err = skc_runtime_cl_create(&runtime->cl,
-                                      target_platform_substring,
-                                      target_device_substring,
-                                      context_properties);
+  // save off CL objects
+  runtime->cl.context   = context_cl;
+  runtime->cl.device_id = device_id_cl;
+
+  // query device alignment
+  cl_uint align_bits;
+
+  cl(GetDeviceInfo(device_id_cl,
+                   CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+                   sizeof(align_bits),
+                   &align_bits,
+                   NULL));
+
+  runtime->cl.align_bytes = align_bits / 8;
 
   // create device
   skc_device_create(runtime);
@@ -149,7 +156,7 @@
   // initialize cq pool
   skc_cq_pool_create(runtime,
                      &runtime->cq_pool,
-                     runtime->config->cq_pool.type,
+                     runtime->config->cq_pool.cq_props,
                      runtime->config->cq_pool.size);
 
   // acquire in-order cq
@@ -176,7 +183,7 @@
 
   context->yield          = skc_runtime_yield;
   context->wait           = skc_runtime_wait;
-  
+
   context->path_builder   = skc_path_builder_cl_12_create;
   context->path_retain    = skc_runtime_path_host_retain;
   context->path_release   = skc_runtime_path_host_release;
@@ -189,7 +196,7 @@
 
   context->composition    = skc_composition_cl_12_create;
   context->styling        = skc_styling_cl_12_create;
-  
+
   context->surface        = skc_surface_cl_12_create;
 
   // block on pool creation
@@ -198,7 +205,7 @@
   // dispose of in-order cq
   skc_runtime_release_cq_in_order(runtime,cq);
 
-  return err;
+  return SKC_ERR_SUCCESS;
 };
 
 //
@@ -227,7 +234,7 @@
   skc_block_pool_dispose(context->runtime);
 
   // skc_handle_pool_dispose(context->runtime);
-  
+
   return SKC_ERR_SUCCESS;
 }
 
@@ -253,12 +260,12 @@
     return;
 
   QueryPerformanceCounter(&EndingTime);
-  
+
   LARGE_INTEGER ElapsedMicroseconds, Frequency;
 
   ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
 
-  QueryPerformanceFrequency(&Frequency);   
+  QueryPerformanceFrequency(&Frequency);
 
   double const msecs_total  = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart;
   double const msecs_frame  = msecs_total / SKC_FRAMES;
@@ -268,7 +275,7 @@
 #endif
 
   struct skc_runtime * const runtime = context->runtime;
-  
+
   // acquire out-of-order cq
   cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
 
@@ -311,4 +318,3 @@
 //
 //
 //
-
diff --git a/src/compute/skc/platforms/cl_12/runtime_cl_12.h b/src/compute/skc/platforms/cl_12/runtime_cl_12.h
index 7e7ffcb..ff820e6 100644
--- a/src/compute/skc/platforms/cl_12/runtime_cl_12.h
+++ b/src/compute/skc/platforms/cl_12/runtime_cl_12.h
@@ -12,8 +12,8 @@
 //
 //
 
+#include "skc.h"
 #include "runtime.h"
-#include "runtime_cl.h"
 #include "cq_pool_cl.h"
 #include "handle_pool_cl_12.h"
 #include "block_pool_cl_12.h"
@@ -31,7 +31,11 @@
   //
   // state visible to device
   //
-  struct skc_runtime_cl            cl;
+  struct {
+    cl_context                     context;
+    cl_device_id                   device_id;
+    cl_uint                        align_bytes;
+  } cl;
 
   struct {
     struct skc_allocator_host      host;
@@ -63,9 +67,8 @@
 
 skc_err
 skc_runtime_cl_12_create(struct skc_context * const context,
-                         char const         * const target_platform_substring,
-                         char const         * const target_device_substring,
-                         cl_context_properties      context_properties[]);
+                         cl_context                 context_cl,
+                         cl_device_id               device_id_cl);
 
 skc_err
 skc_runtime_cl_12_dispose(struct skc_context * const context);
diff --git a/src/compute/skc/raster_builder.c b/src/compute/skc/raster_builder.c
index 6da8071..a0f1fcf 100644
--- a/src/compute/skc/raster_builder.c
+++ b/src/compute/skc/raster_builder.c
@@ -69,7 +69,7 @@
     0.0f, 0.0f         // w0  w1  1  <-- always 1
   };
 
-float const * const skc_transform_identity_ptr = skc_transform_identity;
+// float const * const skc_transform_identity_ptr = skc_transform_identity;
 
 //
 // DEFAULT RASTER CLIP
@@ -82,7 +82,7 @@
     +FLT_MAX, +FLT_MAX  // upper right corner of bounding box
   };
 
-float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
+// float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
 
 #endif
 
diff --git a/src/compute/skc/skc.h b/src/compute/skc/skc.h
index e46b6a9..a81a534 100644
--- a/src/compute/skc/skc.h
+++ b/src/compute/skc/skc.h
@@ -10,125 +10,18 @@
 #define SKC_ONCE_SKC
 
 //
-// FIXME -- get rid of these here
-//
-
-#include <stdint.h>
-#include <stdbool.h>
-
 //
 //
-//
 
-#include "skc_styling.h" // FIXME -- skc_styling
-// #include "skc_err.h"
-
-//
-// FIXME -- move errors to an skc prefixed include
-//
-
-typedef enum skc_err {
-
-  SKC_ERR_SUCCESS                           = 0,
-
-  SKC_ERR_API_BASE                          = 10000,
-
-  SKC_ERR_NOT_IMPLEMENTED                   = SKC_ERR_API_BASE,
-
-  SKC_ERR_POOL_EMPTY,
-
-  SKC_ERR_CONDVAR_WAIT,
-
-  SKC_ERR_LAYER_ID_INVALID,
-  SKC_ERR_LAYER_NOT_EMPTY,
-
-  SKC_ERR_TRANSFORM_WEAKREF_INVALID,
-  SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
-
-  SKC_ERR_COMMAND_NOT_READY,
-  SKC_ERR_COMMAND_NOT_COMPLETED,
-  SKC_ERR_COMMAND_NOT_STARTED,
-
-  SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
-
-  SKC_ERR_COMPOSITION_SEALED,
-  SKC_ERR_STYLING_SEALED,
-
-  SKC_ERR_HANDLE_INVALID,
-  SKC_ERR_HANDLE_OVERFLOW,
-
-  SKC_ERR_COUNT
-
-} skc_err;
-
-//
-// SPINEL TYPES
-//
-
-typedef struct skc_context          * skc_context_t;
-typedef struct skc_path_builder     * skc_path_builder_t;
-typedef struct skc_raster_builder   * skc_raster_builder_t;
-
-typedef struct skc_composition      * skc_composition_t;
-typedef struct skc_styling          * skc_styling_t;
-
-typedef struct skc_surface          * skc_surface_t;
-
-#if 0
-typedef struct skc_interop          * skc_interop_t;
-typedef        uint32_t               skc_interop_surface_t;
-#endif
-
-typedef        uint32_t               skc_path_t;
-typedef        uint32_t               skc_raster_t;
-
-typedef        uint32_t               skc_layer_id;
-typedef        uint32_t               skc_group_id;
-
-typedef        uint32_t               skc_styling_cmd_t;
-
-typedef        uint64_t               skc_weakref_t;
-typedef        skc_weakref_t          skc_transform_weakref_t;
-typedef        skc_weakref_t          skc_raster_clip_weakref_t;
-
-//
-// FIXME -- bury all of this
-//
-
-#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
-#define SKC_GROUP_IDS(...)    _countof(__VA_ARGS__),__VA_ARGS__
-
-//
-//
-//
-
-#define SKC_PATH_INVALID     UINT32_MAX
-#define SKC_RASTER_INVALID   UINT32_MAX
-#define SKC_WEAKREF_INVALID  UINT64_MAX
-
-//
-// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
-//
-
-extern float const * const skc_transform_identity_ptr; // { 1, 0, 0, 0, 1, 0, 0, 0 }
-
-//
-// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
-//
-
-extern float const * const skc_raster_clip_default_ptr;
+#include "skc_err.h"
+#include "skc_types.h"
+#include "skc_styling.h"
 
 //
 // CONTEXT
 //
 
 skc_err
-skc_context_create(skc_context_t       * context,
-                   char          const * target_platform_substring,
-                   char          const * target_device_substring,
-                   intptr_t              context_properties[]);
-
-skc_err
 skc_context_retain(skc_context_t context);
 
 skc_err
@@ -138,31 +31,6 @@
 skc_context_reset(skc_context_t context);
 
 //
-// COORDINATED EXTERNAL OPERATIONS
-//
-
-/*
-  Examples include:
-
-  - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
-  - Subpixel antialiasing using neighboring pixel color and coverage data.
-  - Performing a blit from one region to another region on a surface.
-  - Blitting from one surface to another.
-  - Loading and processing from one region and storing to another region.
-  - Rendezvousing with an external pipeline.
-*/
-
-//
-//
-//
-
-bool
-skc_context_yield(skc_context_t context);
-
-void
-skc_context_wait(skc_context_t context);
-
-//
 // PATH BUILDER
 //
 
@@ -486,6 +354,31 @@
                    void                        * fb); // FIXME FIXME
 
 //
+// COORDINATED EXTERNAL OPERATIONS
+//
+//  Examples include:
+//
+//  - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
+//  - Subpixel antialiasing using neighboring pixel color and coverage data.
+//  - Performing a blit from one region to another region on a surface.
+//  - Blitting from one surface to another.
+//  - Loading and processing from one region and storing to another region.
+//  - Rendezvousing with an external pipeline.
+//
+
+// FORTHCOMING...
+
+//
+// SCHEDULER
+//
+
+bool
+skc_context_yield(skc_context_t context);
+
+void
+skc_context_wait(skc_context_t context);
+
+//
 //
 //
 
diff --git a/src/compute/skc/skc_create_cl.h b/src/compute/skc/skc_create_cl.h
new file mode 100644
index 0000000..0ab0fe0
--- /dev/null
+++ b/src/compute/skc/skc_create_cl.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#ifndef SKC_ONCE_SKC_CREATE_CL
+#define SKC_ONCE_SKC_CREATE_CL
+
+//
+//
+//
+
+#ifdef __APPLE__
+#include "OpenCL/opencl.h"
+#else
+#include "CL/opencl.h"
+#endif
+
+//
+//
+//
+
+#include "skc.h"
+
+//
+// CONTEXT CREATION
+//
+
+skc_err
+skc_context_create_cl(skc_context_t * context,
+                      cl_context      context_cl,
+                      cl_device_id    device_id_cl);
+
+//
+// FIXME -- SPECIALIZE SURFACE RENDER
+//
+
+#if 0
+
+//
+// SURFACE RENDER
+//
+
+typedef void (*skc_surface_render_pfn_notify)(skc_surface_t     surface,
+                                              skc_styling_t     styling,
+                                              skc_composition_t composition,
+                                              void            * data);
+skc_err
+skc_surface_render(skc_surface_t                 surface,
+                   uint32_t                const clip[4],
+                   skc_styling_t                 styling,
+                   skc_composition_t             composition,
+                   skc_surface_render_pfn_notify notify,
+                   void                        * data,
+                   void                        * fb); // FIXME FIXME
+
+#endif
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/skc/skc_err.h b/src/compute/skc/skc_err.h
new file mode 100644
index 0000000..6587e7d
--- /dev/null
+++ b/src/compute/skc/skc_err.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#ifndef SKC_ONCE_SKC_ERR
+#define SKC_ONCE_SKC_ERR
+
+//
+//
+//
+
+typedef enum skc_err {
+
+  SKC_ERR_SUCCESS                           = 0,
+
+  SKC_ERR_API_BASE                          = 10000,
+
+  SKC_ERR_NOT_IMPLEMENTED                   = SKC_ERR_API_BASE,
+
+  SKC_ERR_POOL_EMPTY,
+
+  SKC_ERR_CONDVAR_WAIT,
+
+  SKC_ERR_LAYER_ID_INVALID,
+  SKC_ERR_LAYER_NOT_EMPTY,
+
+  SKC_ERR_TRANSFORM_WEAKREF_INVALID,
+  SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
+
+  SKC_ERR_COMMAND_NOT_READY,
+  SKC_ERR_COMMAND_NOT_COMPLETED,
+  SKC_ERR_COMMAND_NOT_STARTED,
+
+  SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
+
+  SKC_ERR_COMPOSITION_SEALED,
+  SKC_ERR_STYLING_SEALED,
+
+  SKC_ERR_HANDLE_INVALID,
+  SKC_ERR_HANDLE_OVERFLOW,
+
+  SKC_ERR_COUNT
+
+} skc_err;
+
+//
+//
+//
+
+#endif
+
+//
+//
+//
diff --git a/src/compute/skc/skc_styling.h b/src/compute/skc/skc_styling.h
index 73cc4fc..62b9e14 100644
--- a/src/compute/skc/skc_styling.h
+++ b/src/compute/skc/skc_styling.h
@@ -80,6 +80,13 @@
 } skc_styling_gradient_type_e;
 
 //
+// FIXME -- bury all of this once we stabilize styling
+//
+
+#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
+#define SKC_GROUP_IDS(...)    _countof(__VA_ARGS__),__VA_ARGS__
+
+//
 //
 //
 
diff --git a/src/compute/skc/skc_types.h b/src/compute/skc/skc_types.h
new file mode 100644
index 0000000..0dbcf18
--- /dev/null
+++ b/src/compute/skc/skc_types.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can
+ * be found in the LICENSE file.
+ *
+ */
+
+#ifndef SKC_ONCE_SKC_TYPES
+#define SKC_ONCE_SKC_TYPES
+
+//
+//
+//
+
+#include <stdint.h>
+#include <stdbool.h>
+
+//
+//
+//
+
+typedef struct skc_context          * skc_context_t;
+typedef struct skc_path_builder     * skc_path_builder_t;
+typedef struct skc_raster_builder   * skc_raster_builder_t;
+
+typedef struct skc_composition      * skc_composition_t;
+typedef struct skc_styling          * skc_styling_t;
+
+typedef struct skc_surface          * skc_surface_t;
+
+typedef        uint32_t               skc_path_t;
+typedef        uint32_t               skc_raster_t;
+
+typedef        uint32_t               skc_layer_id;
+typedef        uint32_t               skc_group_id;
+
+typedef        uint32_t               skc_styling_cmd_t;
+
+typedef        uint64_t               skc_weakref_t;
+typedef        skc_weakref_t          skc_transform_weakref_t;
+typedef        skc_weakref_t          skc_raster_clip_weakref_t;
+
+#if 0
+typedef struct skc_interop          * skc_interop_t;
+typedef        uint32_t               skc_interop_surface_t;
+#endif
+
+//
+//
+//
+
+#define SKC_PATH_INVALID     UINT32_MAX
+#define SKC_RASTER_INVALID   UINT32_MAX
+#define SKC_WEAKREF_INVALID  UINT64_MAX
+
+//
+// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
+//
+
+//
+// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
+//
+
+//
+//
+//
+
+#endif
+
+//
+//
+//