/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY.  IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
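
//
// In brief: one sub-group ("warp") processes one slab of sorted TTCK
// keys. It marks every key that starts a new tile (a change in the YX
// coordinates held in the key's high word), appends that key's final
// linear index to 'indices', and then transposes the register-resident
// slab back into linear order in 'vout'.
//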

#include "tile.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define HS_KEYS_PER_SLAB  (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK      (HS_LANES_PER_WARP - 1)
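
//
// Example (illustrative values only -- the actual constants come from
// the sorter's configuration): with HS_KEYS_PER_LANE = 16 and
// HS_LANES_PER_WARP = 8, a slab holds 16 * 8 = 128 keys and
// HS_LANE_MASK is 0x7.
//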

//
//
//

#define SKC_YX_NEQ(row,prev)                                            \
  (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
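
//
// i.e. two keys land on different tiles iff the XOR of their high words
// has any bits set under SKC_TTCK_HI_MASK_YX, the mask (defined with the
// TTCK key layout) that selects the tile's Y and X coordinates.
//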

//
//
//

__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
                        __global uint        * SKC_RESTRICT const indices,
                        __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
  uint const global_id = get_global_id(0);
  uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
  uint const gmem_idx  = gmem_base + (global_id & HS_LANE_MASK);
  uint const lane_idx  = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
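
  // gmem_base is the first key of this sub-group's slab, gmem_idx is
  // this lane's key within the row-ordered layout the rows are loaded
  // from below, and lane_idx is the first key of this lane's column in
  // the linear layout produced by the final transpose -- the layout the
  // emitted indices refer to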

  //
  // LOAD ALL THE ROWS
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                                            \
  HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];

  HS_SLAB_ROWS();
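
  // HS_SLAB_ROWS() expands to HS_SLAB_ROW(1,0), HS_SLAB_ROW(2,1), ...
  // so the slab now sits in registers r1..rN (one key per row), with N
  // equal to HS_KEYS_PER_LANE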

  //
  // LOAD LAST REGISTER FROM COLUMN TO LEFT
  //
  uint  diffs = 0;
  uint2 r0    = r1;

  if (gmem_base > 0) {
    // if this is the first key in any slab but the first then
    // broadcast-load the last key in the previous slab
    r0.hi = as_uint2(vout[gmem_base - 1]).hi;
  } else if (get_sub_group_local_id() == 0) {
    // if this is the first lane in the first slab
    diffs = 1;
  }

  // now shuffle in the last key from the column to the left
  r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
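
  // r0.hi now holds the high word of the key immediately preceding each
  // lane's first key: lane 0 keeps the broadcast-loaded last key of the
  // previous slab (a don't-care for the very first slab, whose diffs bit
  // 0 was forced above), while every other lane receives the last key of
  // the column to its left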

  //
  // FIND ALL DIFFERENCES IN SLAB
  //
  uint valid = 0;

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  valid |= ((r##row != SKC_ULONG_MAX) << prev);

  HS_SLAB_ROWS();
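
  // each bit of 'valid' is set when the corresponding row's key is not
  // the SKC_ULONG_MAX sentinel used for invalid/padding keys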

#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  diffs |= (SKC_YX_NEQ(row,prev) << prev);

  HS_SLAB_ROWS();
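
  // each bit of 'diffs' is set when the corresponding row's key starts a
  // new tile -- its YX coords differ from the preceding key's (row 0
  // compares against r0, the key carried in from the left or from the
  // previous slab)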

  //
  // SUM UP THE DIFFERENCES
  //
  uint const valid_diffs = valid & diffs;
  uint const count       = popcount(valid_diffs);
  uint const inclusive   = sub_group_scan_inclusive_add(count);
  uint const exclusive   = inclusive - count;
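
  // count is the number of new-tile keys this lane will emit; the
  // sub-group scans convert the per-lane counts into offsets, with the
  // last lane's inclusive sum equal to the sub-group total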

  //
  // RESERVE SPACE IN THE INDICES ARRAY
  //
  uint next = 0;

  if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
    next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset

  // distribute base across subgroup
  next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
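
  // only the last lane reserves space -- its inclusive sum is the
  // sub-group total -- and the returned base is broadcast to all lanes
  // and offset by each lane's exclusive sum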

  //
  // STORE THE INDICES
  //
#undef  HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev)                   \
  if (valid_diffs & (1 << prev))                \
    indices[next++] = lane_idx + prev;

  HS_SLAB_ROWS();
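
  // each set bit of valid_diffs emits one index: the position of that
  // key in the linearized slab that HS_TRANSPOSE_SLAB() stores below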

  //
  // TRANSPOSE THE SLAB AND STORE IT
  //
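  // HS_TRANSPOSE_SLAB() is defined elsewhere (presumably in the included
  // sorter headers); it stores the register-resident slab back to vout
  // with the slab transposed into linear order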
  HS_TRANSPOSE_SLAB();
}

//
//
//