| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| * |
| */ |
| |
| // |
| // |
| // |
| |
| #include <stdlib.h> |
| #include <stdbool.h> |
| #include <string.h> |
| #include <getopt.h> |
| |
| // |
| // |
| // |
| |
| #include "networks.h" |
| #include "macros.h" |
| #include "util.h" |
| |
| // |
| // |
| // |
| |
| #define HSG_INDENT 2 |
| |
| // |
| // |
| // |
| |
| #undef HSG_OP_EXPAND_X |
| #define HSG_OP_EXPAND_X(t) #t , |
| |
| static |
| char const * const |
| hsg_op_type_string[] = |
| { |
| HSG_OP_EXPAND_ALL() |
| }; |
| |
| // |
| // |
| // |
| |
| #define EXIT() (struct hsg_op){ HSG_OP_TYPE_EXIT } |
| |
| #define END() (struct hsg_op){ HSG_OP_TYPE_END } |
| #define BEGIN() (struct hsg_op){ HSG_OP_TYPE_BEGIN } |
| #define ELSE() (struct hsg_op){ HSG_OP_TYPE_ELSE } |
| |
| #define STORE_SLAB_EARLY_EXIT() (struct hsg_op){ HSG_OP_TYPE_STORE_SLAB_EARLY_EXIT } |
| |
| #define FILE_HEADER() (struct hsg_op){ HSG_OP_TYPE_FILE_HEADER } |
| #define FILE_FOOTER() (struct hsg_op){ HSG_OP_TYPE_FILE_FOOTER } |
| |
| #define TRANSPOSE_KERNEL_PROTO() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO } |
| #define TRANSPOSE_KERNEL_PREAMBLE() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE } |
| #define TRANSPOSE_KERNEL_BODY() (struct hsg_op){ HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY } |
| |
| #define BS_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BS_KERNEL_PROTO, { i } } |
| #define BS_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BS_KERNEL_PREAMBLE, { i } } |
| |
| #define BC_KERNEL_PROTO(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PROTO, { i } } |
| #define BC_KERNEL_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_KERNEL_PREAMBLE, { i } } |
| |
| #define FM_KERNEL_PROTO(l,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PROTO, { l, s } } |
| #define FM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_FM_KERNEL_PREAMBLE, { w, s } } |
| |
| #define HM_KERNEL_PROTO(d,w) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PROTO, { d, w } } |
| #define HM_KERNEL_PREAMBLE(w,s) (struct hsg_op){ HSG_OP_TYPE_HM_KERNEL_PREAMBLE, { w, s } } |
| |
| #define BX_REG_GLOBAL_LOAD(n,v) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_LOAD, { n, v } } |
| #define BX_REG_GLOBAL_STORE(n) (struct hsg_op){ HSG_OP_TYPE_BX_REG_GLOBAL_STORE, { n } } |
| |
| #define FM_REG_GLOBAL_LOAD_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT, { n, i } } |
| #define FM_REG_GLOBAL_STORE_LEFT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT, { n, i } } |
| #define FM_REG_GLOBAL_LOAD_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT, { n, i } } |
| #define FM_REG_GLOBAL_STORE_RIGHT(n,i) (struct hsg_op){ HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT, { n, i } } |
| |
| #define HM_REG_GLOBAL_LOAD(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_LOAD, { n, i } } |
| #define HM_REG_GLOBAL_STORE(n,i) (struct hsg_op){ HSG_OP_TYPE_HM_REG_GLOBAL_STORE, { n, i } } |
| |
| #define WARP_FLIP(f) (struct hsg_op){ HSG_OP_TYPE_WARP_FLIP, { f } } |
| #define WARP_HALF(h) (struct hsg_op){ HSG_OP_TYPE_WARP_HALF, { h } } |
| |
| #define CMP_FLIP(a,b,c) (struct hsg_op){ HSG_OP_TYPE_CMP_FLIP, { a, b, c } } |
| #define CMP_HALF(a,b) (struct hsg_op){ HSG_OP_TYPE_CMP_HALF, { a, b } } |
| |
| #define CMP_XCHG(a,b,p) (struct hsg_op){ HSG_OP_TYPE_CMP_XCHG, { a, b, p } } |
| |
| #define BS_REG_SHARED_STORE_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_STORE_V, { m, i, r } } |
| #define BS_REG_SHARED_LOAD_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_V, { m, i, r } } |
| #define BC_REG_SHARED_LOAD_V(m,i,r) (struct hsg_op){ HSG_OP_TYPE_BC_REG_SHARED_LOAD_V, { m, i, r } } |
| |
| #define BX_REG_SHARED_STORE_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT, { r, i, p } } |
| #define BS_REG_SHARED_STORE_RIGHT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT, { r, i, p } } |
| |
| #define BS_REG_SHARED_LOAD_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT, { r, i, p } } |
| #define BS_REG_SHARED_LOAD_RIGHT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT, { r, i, p } } |
| |
| #define BC_REG_GLOBAL_LOAD_LEFT(r,i,p) (struct hsg_op){ HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT, { r, i, p } } |
| |
| #define REG_F_PREAMBLE(s) (struct hsg_op){ HSG_OP_TYPE_REG_F_PREAMBLE, { s } } |
| #define REG_SHARED_STORE_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_SHARED_STORE_F, { r, i, s } } |
| #define REG_SHARED_LOAD_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_SHARED_LOAD_F, { r, i, s } } |
| #define REG_GLOBAL_STORE_F(r,i,s) (struct hsg_op){ HSG_OP_TYPE_REG_GLOBAL_STORE_F, { r, i, s } } |
| |
| #define BLOCK_SYNC() (struct hsg_op){ HSG_OP_TYPE_BLOCK_SYNC } |
| |
| #define BS_FRAC_PRED(m,w) (struct hsg_op){ HSG_OP_TYPE_BS_FRAC_PRED, { m, w } } |
| |
| #define BS_MERGE_H_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BS_MERGE_H_PREAMBLE, { i } } |
| #define BC_MERGE_H_PREAMBLE(i) (struct hsg_op){ HSG_OP_TYPE_BC_MERGE_H_PREAMBLE, { i } } |
| |
| #define BX_MERGE_H_PRED(p) (struct hsg_op){ HSG_OP_TYPE_BX_MERGE_H_PRED, { p } } |
| |
| #define BS_ACTIVE_PRED(m,l) (struct hsg_op){ HSG_OP_TYPE_BS_ACTIVE_PRED, { m, l } } |
| |
| #define FM_MERGE_RIGHT_PRED(n,s) (struct hsg_op){ HSG_OP_TYPE_FM_MERGE_RIGHT_PRED, { n, s } } |
| |
| // |
| // DEFAULTS |
| // |
| |
| struct hsg_config hsg_config = // FIXME -- how useful is this? |
| { |
| .merge = { |
| .flip = { |
| .lo = 1, |
| .hi = 1 |
| }, |
| .half = { |
| .lo = 1, |
| .hi = 1 |
| }, |
| |
| .max_log2 = 27 // 2^27th = 128m |
| }, |
| |
| .block = { |
| .warps_min = 1, // min warps for a block that uses smem barriers |
| .warps_max = UINT32_MAX, // max warps for the entire multiprocessor |
| .warps_mod = 2, // the number of warps necessary to load balance horizontal merging |
| |
| .smem_min = 0, |
| .smem_quantum = 1, |
| |
| .smem_bs = 49152, |
| .smem_bc = UINT32_MAX // implies field not set |
| }, |
| |
| .warp = { |
| .lanes = 32, |
| }, |
| |
| .thread = { |
| .regs = 24, |
| .xtra = 0 |
| }, |
| |
| .type = { |
| .words = 2 |
| } |
| }; |
| |
| // |
| // ZERO HSG_MERGE STRUCT |
| // |
| |
| struct hsg_merge hsg_merge[MERGE_LEVELS_MAX_LOG2] = { 0 }; |
| |
| // |
| // |
| // |
| |
| static const hsg_target_pfn hsg_target_pfns[] = |
| { |
| hsg_target_debug, |
| hsg_target_cuda_sm3x, |
| hsg_target_igp_genx, |
| // hsg_target_adreno_5xx, |
| // hsg_target_amd_gcn, |
| // hsg_target_x86_sse, |
| // hsg_target_x86_avx2, |
| }; |
| |
| static const char * hsg_target_pfn_string[] = |
| { |
| "hs_debug", |
| "hs_cuda", |
| "hs_cl" |
| }; |
| |
| static const char * hsg_file_type_string[][2] = |
| { |
| { ".h", ".txt" }, |
| { ".h", ".cu" }, |
| { ".h", ".cl" } |
| }; |
| |
| // |
| // |
| // |
| |
| #define HSG_TARGET_PFN_COUNT ARRAY_LENGTH(hsg_target_pfns) |
| |
| // |
| // |
| // |
| |
| static hsg_op_type hsg_op_type_counts[HSG_OP_TYPE_COUNT] = { 0 }; |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hsg_op_debug() |
| { |
| for (hsg_op_type t=HSG_OP_TYPE_EXIT; t<HSG_OP_TYPE_COUNT; t++) |
| fprintf(stderr,"%-37s : %u\n",hsg_op_type_string[t],hsg_op_type_counts[t]); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hsg_config_init_shared() |
| { |
| // |
| // The assumption here is that a proper smem_bs value was provided |
| // that represents the maximum fraction of the multiprocessor's |
| // available shared memory that can be accessed by the initial block |
| // sorting kernel. |
| // |
| // With CUDA devices this is 48KB out of 48KB, 64KB or 96KB. |
| // |
| // Intel subslices are a little trickier and the minimum allocation |
| // is 4KB and the maximum is 64KB on pre-Skylake IGPs. Sizes are |
| // allocated in 1KB increments. If a maximum of two block sorters |
| // can occupy a subslice then each should be assigned 32KB of shared |
| // memory. |
| // |
| // News Flash: apparently GEN9+ IGPs can allocate 1KB of SMEM per |
| // workgroup so all the previously written logic to support this |
| // issue is being removed. |
| // |
| uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t)); |
| |
| hsg_config.warp.skpw_bs = bs_keys / hsg_merge[0].warps; |
| } |
| |
| static |
| void |
| hsg_merge_levels_init_shared(struct hsg_merge * const merge) |
| { |
| { |
| // |
| // What is the max amount of shared in each possible bs block config? |
| // |
| // The provided smem_bs size will be allocated for each sorting block. |
| // |
| uint32_t const bs_threads = merge->warps * hsg_config.warp.lanes; |
| uint32_t const bs_keys = hsg_config.block.smem_bs / (hsg_config.type.words * sizeof(uint32_t)); |
| uint32_t const bs_kpt = bs_keys / bs_threads; |
| uint32_t const bs_kpt_mod = (bs_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; |
| uint32_t const bs_rows_even = bs_kpt_mod & ~1; // must be even because flip merge only works on row pairs |
| |
| // this is a showstopper |
| if (bs_rows_even < 2) |
| { |
| fprintf(stderr,"Error: need at least 2 rows of shared memory.\n"); |
| exit(-1); |
| } |
| |
| // clamp to number of registers |
| merge->rows_bs = min(bs_rows_even, hsg_config.thread.regs); |
| } |
| |
| // |
| // smem key allocation rule for BC kernels is that a single block |
| // can't allocate more than smem_bs and must allocate at least |
| // smem_min in smem_quantum steps. |
| // |
| // Note that BC blocks will always be less than or equal to BS |
| // blocks. |
| // |
| { |
| // |
| // if merge->warps is not pow2 then we're going to skip creating a bc elsewhere |
| // |
| uint32_t const bc_warps_min = max(merge->warps,hsg_config.block.warps_min); |
| uint32_t const bc_threads = bc_warps_min * hsg_config.warp.lanes; |
| uint32_t const bc_block_rd = (((hsg_config.block.smem_bc * bc_warps_min) / hsg_config.block.warps_max) / |
| hsg_config.block.smem_quantum) * hsg_config.block.smem_quantum; |
| uint32_t const bc_block_max = max(bc_block_rd,hsg_config.block.smem_min); |
| uint32_t const bc_block_smem = min(bc_block_max,hsg_config.block.smem_bs); |
| |
| // what is the max amount of shared in each possible bc block config? |
| uint32_t const bc_keys = bc_block_smem / (hsg_config.type.words * sizeof(uint32_t)); |
| uint32_t const bc_kpt = bc_keys / bc_threads; |
| uint32_t const bc_kpt_mod = (bc_kpt / hsg_config.block.warps_mod) * hsg_config.block.warps_mod; |
| |
| merge->rows_bc = min(bc_kpt_mod, hsg_config.thread.regs); |
| merge->skpw_bc = bc_keys / bc_warps_min; |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| void |
| hsg_merge_levels_init_1(struct hsg_merge * const merge, uint32_t const warps, uint32_t const level, uint32_t const offset) |
| { |
| uint32_t const even_odd = warps & 1; |
| |
| merge->levels[level].evenodds[even_odd]++; |
| merge->levels[level].networks[even_odd] = warps; |
| |
| if (warps == 1) |
| return; |
| |
| merge->levels[level].active.b64 |= BITS_TO_MASK_AT_64(warps,offset); |
| |
| uint32_t const count = merge->levels[level].count++; |
| uint32_t const index = (1 << level) + count; |
| uint32_t const bit = 1 << count; |
| |
| merge->levels[level].evenodd_masks[even_odd] |= bit; |
| |
| if (count > 0) |
| { |
| // offset from network to left of this network |
| uint32_t const diff = offset - merge->offsets[index-1]; |
| |
| uint32_t const diff_0 = merge->levels[level].diffs[0]; |
| uint32_t const diff_1 = merge->levels[level].diffs[1]; |
| |
| uint32_t diff_idx = UINT32_MAX; |
| |
| if ((diff_0 == 0) || (diff_0 == diff)) { |
| diff_idx = 0; |
| } else if ((diff_1 == 0) || (diff_1 == diff)) { |
| diff_idx = 1; |
| } else { |
| fprintf(stderr, "*** MORE THAN TWO DIFFS ***\n"); |
| exit(-1); |
| } |
| |
| merge->levels[level].diffs [diff_idx] = diff; |
| merge->levels[level].diff_masks[diff_idx] |= 1 << (count-1); |
| } |
| |
| merge->networks[index] = warps; |
| merge->offsets [index] = offset; |
| |
| uint32_t const l = (warps+1)/2; // lower/larger on left |
| uint32_t const r = (warps+0)/2; // higher/smaller on right |
| |
| hsg_merge_levels_init_1(merge,l,level+1,offset); |
| hsg_merge_levels_init_1(merge,r,level+1,offset+l); |
| } |
| |
| static |
| void |
| hsg_merge_levels_debug(struct hsg_merge * const merge) |
| { |
| for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++) |
| { |
| uint32_t count = merge->levels[level].count; |
| |
| if (count == 0) |
| break; |
| |
| fprintf(stderr, |
| "%-4u : %016llX \n", |
| count, |
| merge->levels[level].active.b64); |
| |
| fprintf(stderr, |
| "%-4u : %08X (%2u)\n" |
| "%-4u : %08X (%2u)\n", |
| merge->levels[level].diffs[0], |
| merge->levels[level].diff_masks[0], |
| __popcnt(merge->levels[level].diff_masks[0]), |
| merge->levels[level].diffs[1], |
| merge->levels[level].diff_masks[1], |
| __popcnt(merge->levels[level].diff_masks[1])); |
| |
| fprintf(stderr, |
| "EVEN : %08X (%2u)\n" |
| "ODD : %08X (%2u)\n", |
| merge->levels[level].evenodd_masks[0], |
| __popcnt(merge->levels[level].evenodd_masks[0]), |
| merge->levels[level].evenodd_masks[1], |
| __popcnt(merge->levels[level].evenodd_masks[1])); |
| |
| for (uint32_t ii=0; ii<2; ii++) |
| { |
| if (merge->levels[level].networks[ii] > 1) |
| { |
| fprintf(stderr, |
| "%-4s : ( %2u x %2u )\n", |
| (ii == 0) ? "EVEN" : "ODD", |
| merge->levels[level].evenodds[ii], |
| merge->levels[level].networks[ii]); |
| } |
| } |
| |
| uint32_t index = 1 << level; |
| |
| while (count-- > 0) |
| { |
| fprintf(stderr, |
| "[ %2u %2u ] ", |
| merge->offsets [index], |
| merge->networks[index]); |
| |
| index += 1; |
| } |
| |
| fprintf(stderr,"\n\n"); |
| } |
| } |
| |
| static |
| void |
| hsg_merge_levels_hint(struct hsg_merge * const merge, bool const autotune) |
| { |
| // clamp against merge levels |
| for (uint32_t level=0; level<MERGE_LEVELS_MAX_LOG2; level++) |
| { |
| // max network |
| uint32_t const n_max = max(merge->levels[level].networks[0], |
| merge->levels[level].networks[1]); |
| |
| if (n_max <= (merge->rows_bs + hsg_config.thread.xtra)) |
| break; |
| |
| if (autotune) |
| { |
| hsg_config.thread.xtra = n_max - merge->rows_bs; |
| |
| uint32_t const r_total = hsg_config.thread.regs + hsg_config.thread.xtra; |
| uint32_t const r_limit = (hsg_config.type.words == 1) ? 120 : 58; |
| |
| if (r_total <= r_limit) |
| { |
| fprintf(stderr,"autotune: %u + %u\n", |
| hsg_config.thread.regs, |
| hsg_config.thread.xtra); |
| break; |
| } |
| else |
| { |
| fprintf(stderr,"skipping autotune: %u + %u > %u\n", |
| hsg_config.thread.regs, |
| hsg_config.thread.xtra, |
| r_limit); |
| exit(-1); |
| } |
| } |
| |
| fprintf(stderr,"*** HINT *** Try extra registers: %u\n", |
| n_max - merge->rows_bs); |
| |
| exit(-1); |
| } |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_op(struct hsg_op * ops, struct hsg_op const opcode) |
| { |
| hsg_op_type_counts[opcode.type] += 1; |
| |
| *ops = opcode; |
| |
| return ops+1; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_exit(struct hsg_op * ops) |
| { |
| return hsg_op(ops,EXIT()); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_end(struct hsg_op * ops) |
| { |
| return hsg_op(ops,END()); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_begin(struct hsg_op * ops) |
| { |
| return hsg_op(ops,BEGIN()); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_else(struct hsg_op * ops) |
| { |
| return hsg_op(ops,ELSE()); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_network_copy(struct hsg_op * ops, |
| struct hsg_network const * const nets, |
| uint32_t const idx, |
| uint32_t const prefix) |
| { |
| uint32_t const len = nets[idx].length; |
| struct hsg_op const * const cxa = nets[idx].network; |
| |
| for (uint32_t ii=0; ii<len; ii++) |
| { |
| const struct hsg_op * const cx = cxa + ii; |
| |
| ops = hsg_op(ops,CMP_XCHG(cx->a,cx->b,prefix)); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_sort(struct hsg_op * ops) |
| { |
| uint32_t const idx = hsg_config.thread.regs / 2 - 1; |
| |
| return hsg_network_copy(ops,hsg_networks_sorting,idx,UINT32_MAX); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge_prefix(struct hsg_op * ops, uint32_t const network, uint32_t const prefix) |
| { |
| if (network <= 1) |
| return ops; |
| |
| return hsg_network_copy(ops,hsg_networks_merging,network-2,prefix); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge(struct hsg_op * ops, uint32_t const network) |
| { |
| return hsg_thread_merge_prefix(ops,network,UINT32_MAX); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge_offset_prefix(struct hsg_op * ops, uint32_t const offset, uint32_t const network, uint32_t const prefix) |
| { |
| if (network <= 1) |
| return ops; |
| |
| uint32_t const idx = network - 2; |
| uint32_t const len = hsg_networks_merging[idx].length; |
| struct hsg_op const * const cxa = hsg_networks_merging[idx].network; |
| |
| for (uint32_t ii=0; ii<len; ii++) |
| { |
| struct hsg_op const * const cx = cxa + ii; |
| |
| ops = hsg_op(ops,CMP_XCHG(offset + cx->a,offset + cx->b,prefix)); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge_offset(struct hsg_op * ops, uint32_t const offset, uint32_t const network) |
| { |
| return hsg_thread_merge_offset_prefix(ops,offset,network,UINT32_MAX); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge_left_right_prefix(struct hsg_op * ops, uint32_t const left, uint32_t const right, uint32_t const prefix) |
| { |
| for (uint32_t l=left,r=left+1; r<=left+right; l--,r++) |
| { |
| ops = hsg_op(ops,CMP_XCHG(l,r,prefix)); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_thread_merge_left_right(struct hsg_op * ops, uint32_t const left, uint32_t const right) |
| { |
| return hsg_thread_merge_left_right_prefix(ops,left,right,UINT32_MAX); |
| } |
| |
| static |
| struct hsg_op * |
| hsg_warp_half_network(struct hsg_op * ops) |
| { |
| uint32_t const n = hsg_config.thread.regs; |
| |
| for (uint32_t r=1; r<=n; r++) |
| ops = hsg_op(ops,CMP_HALF(r-1,r)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_warp_half_downto(struct hsg_op * ops, uint32_t h) |
| { |
| // |
| // *** from h: downto[f/2,1) |
| // **** lane_half(h) |
| // |
| for (; h > 1; h/=2) |
| { |
| ops = hsg_begin(ops); |
| |
| ops = hsg_op(ops,WARP_HALF(h)); |
| ops = hsg_warp_half_network(ops); |
| |
| ops = hsg_end(ops); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_warp_flip_network(struct hsg_op * ops) |
| { |
| uint32_t const n = hsg_config.thread.regs; |
| |
| for (uint32_t r=1; r<=n/2; r++) |
| ops = hsg_op(ops,CMP_FLIP(r-1,r,n+1-r)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_warp_flip(struct hsg_op * ops, uint32_t f) |
| { |
| ops = hsg_begin(ops); |
| |
| ops = hsg_op(ops,WARP_FLIP(f)); |
| ops = hsg_warp_flip_network(ops); |
| |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_bx_warp_load(struct hsg_op * ops, const int32_t vin_or_vout) |
| { |
| uint32_t const n = hsg_config.thread.regs; |
| |
| for (uint32_t r=1; r<=n; r++) |
| ops = hsg_op(ops,BX_REG_GLOBAL_LOAD(r,vin_or_vout)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_bx_warp_store(struct hsg_op * ops) |
| { |
| uint32_t const n = hsg_config.thread.regs; |
| |
| for (uint32_t r=1; r<=n; r++) |
| ops = hsg_op(ops,BX_REG_GLOBAL_STORE(r)); |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_warp_transpose(struct hsg_op * ops) |
| { |
| // func proto |
| ops = hsg_op(ops,TRANSPOSE_KERNEL_PROTO()); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // preamble |
| ops = hsg_op(ops,TRANSPOSE_KERNEL_PREAMBLE()); |
| |
| // load |
| ops = hsg_bx_warp_load(ops,1); // 1 = load from vout[] |
| |
| // emit transpose blend and remap macros ... |
| ops = hsg_op(ops,TRANSPOSE_KERNEL_BODY()); |
| |
| // ... done! |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_warp_half(struct hsg_op * ops, uint32_t const h) |
| { |
| // |
| // *** from h: downto[f/2,1) |
| // **** lane_half(h) |
| // *** thread_merge |
| // |
| ops = hsg_warp_half_downto(ops,h); |
| ops = hsg_thread_merge(ops,hsg_config.thread.regs); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_warp_merge(struct hsg_op * ops) |
| { |
| // |
| // * from f: upto[2,warp.lanes] |
| // ** lane_flip(f) |
| // *** from h: downto[f/2,1) |
| // **** lane_half(h) |
| // *** thread_merge |
| // |
| uint32_t const level = hsg_config.warp.lanes; |
| |
| for (uint32_t f=2; f<=level; f*=2) |
| { |
| ops = hsg_warp_flip(ops,f); |
| ops = hsg_warp_half(ops,f/2); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bc_half_merge_level(struct hsg_op * ops, |
| struct hsg_merge const * const merge, |
| uint32_t const r_lo, |
| uint32_t const s_count) |
| { |
| // guaranteed to be an even network |
| uint32_t const net_even = merge->levels[0].networks[0]; |
| |
| // min of warps in block and remaining horizontal rows |
| uint32_t const active = min(s_count, net_even); |
| |
| // conditional on blockIdx.x |
| if (active < merge->warps) |
| ops = hsg_op(ops,BX_MERGE_H_PRED(active)); // FIXME BX_MERGE |
| |
| // body begin |
| ops = hsg_begin(ops); |
| |
| // scale for min block |
| uint32_t const scale = net_even >= hsg_config.block.warps_min ? 1 : hsg_config.block.warps_min / net_even; |
| |
| // loop if more smem rows than warps |
| for (uint32_t rr=0; rr<s_count; rr+=active) |
| { |
| // body begin |
| ops = hsg_begin(ops); |
| |
| // skip down slab |
| uint32_t const gmem_base = r_lo - 1 + rr; |
| |
| // load registers horizontally -- striding across slabs |
| for (uint32_t ll=1; ll<=net_even; ll++) |
| ops = hsg_op(ops,BC_REG_GLOBAL_LOAD_LEFT(ll,gmem_base+(ll-1)*hsg_config.thread.regs,0)); |
| |
| // merge all registers |
| ops = hsg_thread_merge_prefix(ops,net_even,0); |
| |
| // if we're looping then there is a base |
| uint32_t const smem_base = rr * net_even * scale; |
| |
| // store all registers |
| for (uint32_t ll=1; ll<=net_even; ll++) |
| ops = hsg_op(ops,BX_REG_SHARED_STORE_LEFT(ll,smem_base+ll-1,0)); |
| |
| // body end |
| ops = hsg_end(ops); |
| } |
| |
| // body end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_bc_half_merge(struct hsg_op * ops, struct hsg_merge const * const merge) |
| { |
| // |
| // will only be called with merge->warps >= 2 |
| // |
| uint32_t const warps = max(merge->warps,hsg_config.block.warps_min); |
| |
| // guaranteed to be an even network |
| uint32_t const net_even = merge->levels[0].networks[0]; |
| |
| // set up left SMEM pointer |
| ops = hsg_op(ops,BC_MERGE_H_PREAMBLE(merge->index)); |
| |
| // trim to number of warps in block -- FIXME -- try make this a |
| // multiple of local processor count (Intel = 8, NVIDIA = 4) |
| uint32_t const s_max = merge->rows_bc; |
| |
| // for all the registers |
| for (uint32_t r_lo = 1; r_lo <= hsg_config.thread.regs; r_lo += s_max) |
| { |
| // compute store count |
| uint32_t const r_rem = hsg_config.thread.regs + 1 - r_lo; |
| uint32_t const s_count = min(s_max,r_rem); |
| |
| // block sync -- can skip if first |
| if (r_lo > 1) |
| ops = hsg_op(ops,BLOCK_SYNC()); |
| |
| // merge loop |
| ops = hsg_bc_half_merge_level(ops,merge,r_lo,s_count); |
| |
| // block sync |
| ops = hsg_op(ops,BLOCK_SYNC()); |
| |
| // load rows from shared |
| for (uint32_t c=0; c<s_count; c++) |
| ops = hsg_op(ops,BC_REG_SHARED_LOAD_V(warps,r_lo+c,c)); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bs_flip_merge_level(struct hsg_op * ops, |
| struct hsg_merge const * const merge, |
| uint32_t const level, |
| uint32_t const s_pairs) |
| { |
| // |
| // Note there are a number of ways to flip merge these warps. There |
| // is a magic number in the merge structure that indicates which |
| // warp to activate as well as what network size to invoke. |
| // |
| // This more complex scheme was used in the past. |
| // |
| // The newest scheme is far dumber/simpler and simply directs a warp |
| // to gather up the network associated with a row and merge them. |
| // |
| // This scheme may use more registers per thread but not all |
| // compilers are high quality. |
| // |
| // If there are more warps than smem row pairs to merge then we |
| // disable the spare warps. |
| // |
| // If there are more row pairs than warps then each warp works on |
| // an equal number of rows. |
| // |
| // Note that it takes two warps to flip merge two smem rows. |
| // |
| // FIXME -- We may want to apply the warp smem "mod" value here to |
| // attempt to balance the load>merge>store operations across the |
| // multiprocessor cores. |
| // |
| // FIXME -- the old scheme attempted to keep all the warps active |
| // but the iteration logic was more complex. See 2016 checkins. |
| // |
| |
| // where are we in computed merge? |
| uint32_t const count = merge->levels[level].count; |
| uint32_t const index = 1 << level; |
| |
| uint32_t s_rows = s_pairs * 2; |
| uint32_t base = 0; |
| |
| while (s_rows > 0) |
| { |
| uint32_t active = merge->warps; |
| |
| // disable warps if necessary |
| if (merge->warps > s_rows) { |
| active = s_rows; |
| ops = hsg_op(ops,BX_MERGE_H_PRED(active)); |
| } |
| |
| // body begin |
| ops = hsg_begin(ops); |
| |
| // how many equal number of rows to merge? |
| uint32_t loops = s_rows / active; |
| |
| // decrement |
| s_rows -= loops * active; |
| |
| for (uint32_t ss=0; ss<loops; ss++) |
| { |
| // load all registers |
| for (uint32_t ii=0; ii<count; ii++) |
| { |
| // body begin |
| ops = hsg_begin(ops); |
| |
| uint32_t const offset = merge->offsets [index+ii]; |
| uint32_t const network = merge->networks[index+ii]; |
| uint32_t const lo = (network + 1) / 2; |
| |
| for (uint32_t ll=1; ll<=lo; ll++) |
| ops = hsg_op(ops,BS_REG_SHARED_LOAD_LEFT(ll,base+offset+ll-1,ii)); |
| |
| for (uint32_t rr=lo+1; rr<=network; rr++) |
| ops = hsg_op(ops,BS_REG_SHARED_LOAD_RIGHT(rr,base+offset+rr-1,ii)); |
| |
| // compare left and right |
| ops = hsg_thread_merge_left_right_prefix(ops,lo,network-lo,ii); |
| |
| // right merging network |
| ops = hsg_thread_merge_offset_prefix(ops,lo,network-lo,ii); |
| |
| // left merging network |
| ops = hsg_thread_merge_prefix(ops,lo,ii); |
| |
| for (uint32_t ll=1; ll<=lo; ll++) |
| ops = hsg_op(ops,BX_REG_SHARED_STORE_LEFT(ll,base+offset+ll-1,ii)); |
| |
| for (uint32_t rr=lo+1; rr<=network; rr++) |
| ops = hsg_op(ops,BS_REG_SHARED_STORE_RIGHT(rr,base+offset+rr-1,ii)); |
| |
| // body end |
| ops = hsg_end(ops); |
| } |
| |
| base += active * merge->warps; |
| } |
| |
| // body end |
| ops = hsg_end(ops); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_bs_flip_merge(struct hsg_op * ops, struct hsg_merge const * const merge) |
| { |
| // set up horizontal smem pointer |
| ops = hsg_op(ops,BS_MERGE_H_PREAMBLE(merge->index)); |
| |
| // begin merge |
| uint32_t level = MERGE_LEVELS_MAX_LOG2; |
| |
| while (level-- > 0) |
| { |
| uint32_t const count = merge->levels[level].count; |
| |
| if (count == 0) |
| continue; |
| |
| uint32_t const r_mid = hsg_config.thread.regs/2 + 1; |
| uint32_t const s_pairs_max = merge->rows_bs/2; // this is warp mod |
| |
| // for all the registers |
| for (uint32_t r_lo=1; r_lo<r_mid; r_lo+=s_pairs_max) |
| { |
| uint32_t r_hi = hsg_config.thread.regs + 1 - r_lo; |
| |
| // compute store count |
| uint32_t const s_pairs = min(s_pairs_max,r_mid - r_lo); |
| |
| // store rows to shared |
| for (uint32_t c=0; c<s_pairs; c++) |
| { |
| ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_lo+c,c*2+0)); |
| ops = hsg_op(ops,BS_REG_SHARED_STORE_V(merge->index,r_hi-c,c*2+1)); |
| } |
| |
| // block sync |
| ops = hsg_op(ops,BLOCK_SYNC()); |
| |
| // merge loop |
| ops = hsg_bs_flip_merge_level(ops,merge,level,s_pairs); |
| |
| // block sync |
| ops = hsg_op(ops,BLOCK_SYNC()); |
| |
| // load rows from shared |
| for (uint32_t c=0; c<s_pairs; c++) |
| { |
| ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_lo+c,c*2+0)); |
| ops = hsg_op(ops,BS_REG_SHARED_LOAD_V(merge->index,r_hi-c,c*2+1)); |
| } |
| } |
| |
| // conditionally clean -- no-op if equal to number of warps/block |
| if (merge->levels[level].active.b64 != BITS_TO_MASK_64(merge->warps)) |
| ops = hsg_op(ops,BS_ACTIVE_PRED(merge->index,level)); |
| |
| // clean warp |
| ops = hsg_begin(ops); |
| ops = hsg_warp_half(ops,hsg_config.warp.lanes); |
| ops = hsg_end(ops); |
| } |
| |
| return ops; |
| } |
| |
| /* |
| |
| // |
| // DELETE ME WHEN READY |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bs_flip_merge_all(struct hsg_op * ops, const struct hsg_merge * const merge) |
| { |
| for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++) |
| { |
| const struct hsg_merge* const m = merge + merge_idx; |
| |
| if (m->warps < 2) |
| break; |
| |
| ops = hsg_op(ops,BS_FRAC_PRED(merge_idx,m->warps)); |
| ops = hsg_begin(ops); |
| ops = hsg_bs_flip_merge(ops,m); |
| ops = hsg_end(ops); |
| } |
| |
| return ops; |
| } |
| */ |
| |
| // |
| // GENERATE SORT KERNEL |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bs_sort(struct hsg_op * ops, const struct hsg_merge * const merge) |
| { |
| // func proto |
| ops = hsg_op(ops,BS_KERNEL_PROTO(merge->index)); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // shared declare |
| ops = hsg_op(ops,BS_KERNEL_PREAMBLE(merge->index)); |
| |
| // load |
| ops = hsg_bx_warp_load(ops,0); // 0 = load from vin[] |
| |
| // thread sorting network |
| ops = hsg_thread_sort(ops); |
| |
| // warp merging network |
| ops = hsg_warp_merge(ops); |
| |
| // slab merging network |
| if (merge->warps > 1) |
| ops = hsg_bs_flip_merge(ops,merge); |
| |
| // store |
| ops = hsg_bx_warp_store(ops); |
| |
| // end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE SORT KERNELS |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bs_sort_all(struct hsg_op * ops) |
| { |
| for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++) |
| { |
| const struct hsg_merge* const m = hsg_merge + merge_idx; |
| |
| if (m->warps == 0) |
| break; |
| |
| ops = hsg_bs_sort(ops,m); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE CLEAN KERNEL FOR A POWER-OF-TWO |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bc_clean(struct hsg_op * ops, const struct hsg_merge * const merge) |
| { |
| // func proto |
| ops = hsg_op(ops,BC_KERNEL_PROTO(merge->index)); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // shared declare |
| ops = hsg_op(ops,BC_KERNEL_PREAMBLE(merge->index)); |
| |
| // if warps == 1 then smem isn't used for merging |
| if (merge->warps == 1) |
| { |
| // load slab directly |
| ops = hsg_bx_warp_load(ops,1); // load from vout[] |
| } |
| else |
| { |
| // block merging network -- strided load of slabs |
| ops = hsg_bc_half_merge(ops,merge); |
| } |
| |
| // clean warp |
| ops = hsg_begin(ops); |
| ops = hsg_warp_half(ops,hsg_config.warp.lanes); |
| ops = hsg_end(ops); |
| |
| // store |
| ops = hsg_bx_warp_store(ops); |
| |
| // end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE CLEAN KERNELS |
| // |
| |
| static |
| struct hsg_op * |
| hsg_bc_clean_all(struct hsg_op * ops) |
| { |
| for (uint32_t merge_idx=0; merge_idx<MERGE_LEVELS_MAX_LOG2; merge_idx++) |
| { |
| const struct hsg_merge* const m = hsg_merge + merge_idx; |
| |
| if (m->warps == 0) |
| break; |
| |
| // only generate pow2 clean kernels less than or equal to max |
| // warps in block with the assumption that we would've generated |
| // a wider sort kernel if we could've so a wider clean kernel |
| // isn't a feasible size |
| if (!is_pow2_u32(m->warps)) |
| continue; |
| |
| ops = hsg_bc_clean(ops,m); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE FLIP MERGE KERNEL |
| // |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_load_left(struct hsg_op * ops, uint32_t const n) |
| { |
| uint32_t const mid = n/2; |
| |
| for (uint32_t r=1; r<=mid; r++) |
| ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_LEFT(r,r-1)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_store_left(struct hsg_op * ops, uint32_t const n) |
| { |
| uint32_t const mid = n/2; |
| |
| for (uint32_t r=mid; r>=1; r--) |
| ops = hsg_op(ops,FM_REG_GLOBAL_STORE_LEFT(r,r-1)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_load_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) |
| { |
| uint32_t const mid = n / 2; |
| uint32_t const first = mid + 1; |
| uint32_t const last = mid + span_right; |
| |
| for (uint32_t r=first; r<=last; r++) |
| ops = hsg_op(ops,FM_REG_GLOBAL_LOAD_RIGHT(r,r-first)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_store_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) |
| { |
| uint32_t const mid = n / 2; |
| uint32_t const first = mid + 1; |
| uint32_t const last = mid + span_right; |
| |
| for (uint32_t r=last; r>=first; r--) |
| ops = hsg_op(ops,FM_REG_GLOBAL_STORE_RIGHT(r,r-first)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_merge_right(struct hsg_op * ops, uint32_t const n, uint32_t const span_right) |
| { |
| // conditional |
| ops = hsg_op(ops,FM_MERGE_RIGHT_PRED(n/2,span_right)); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // load |
| ops = hsg_fm_thread_load_right(ops,n,span_right); |
| |
| // compare left and right |
| ops = hsg_thread_merge_left_right(ops,n/2,span_right); |
| |
| // right merging network |
| ops = hsg_thread_merge_offset(ops,n/2,span_right); |
| |
| // store |
| ops = hsg_fm_thread_store_right(ops,n,span_right); |
| |
| // end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_thread_merge_right_all(struct hsg_op * ops, uint32_t const span) |
| { |
| ops = hsg_fm_thread_merge_right(ops,span,span/2); |
| |
| for (uint32_t span_pow2 = pow2_ru_u32(span) / 4; span_pow2 >= 1; span_pow2 /= 2) |
| { |
| ops = hsg_fm_thread_merge_right(ops,span,span_pow2); |
| } |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_fm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const fm_scale) |
| { |
| // func proto |
| ops = hsg_op(ops,FM_KERNEL_PROTO(level,fm_scale)); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // shared declare |
| ops = hsg_op(ops,FM_KERNEL_PREAMBLE(span,fm_scale)); |
| |
| // load |
| ops = hsg_fm_thread_load_left(ops,span); |
| |
| // right merging network |
| ops = hsg_fm_thread_merge_right_all(ops,span); |
| |
| // left merging network |
| ops = hsg_thread_merge(ops,span/2); |
| |
| // store |
| ops = hsg_fm_thread_store_left(ops,span); |
| |
| // end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE HALF MERGE KERNELS |
| // |
| |
| static |
| struct hsg_op * |
| hsg_hm_thread_load(struct hsg_op * ops, uint32_t const n) |
| { |
| for (uint32_t r=1; r<=n; r++) |
| ops = hsg_op(ops,HM_REG_GLOBAL_LOAD(r,r-1)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_hm_thread_store(struct hsg_op * ops, uint32_t const n) |
| { |
| for (uint32_t r=n; r>=1; r--) |
| ops = hsg_op(ops,HM_REG_GLOBAL_STORE(r,r-1)); |
| |
| return ops; |
| } |
| |
| static |
| struct hsg_op * |
| hsg_hm_merge(struct hsg_op * ops, uint32_t const level, uint32_t const span, uint32_t const hm_scale) |
| { |
| // func proto |
| ops = hsg_op(ops,HM_KERNEL_PROTO(level,level-msb_idx_u32(span))); |
| |
| // begin |
| ops = hsg_begin(ops); |
| |
| // declarations |
| ops = hsg_op(ops,HM_KERNEL_PREAMBLE(span,hm_scale)); |
| |
| // load |
| ops = hsg_hm_thread_load(ops,span); |
| |
| // thread merging network |
| ops = hsg_thread_merge(ops,span); |
| |
| // store |
| ops = hsg_hm_thread_store(ops,span); |
| |
| // end |
| ops = hsg_end(ops); |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_fm_merge_level(struct hsg_op * ops, uint32_t const level) |
| { |
| uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); |
| uint32_t const bc_max_log2 = msb_idx_u32(bc_max); |
| |
| uint32_t const fm_level = (level <= bc_max_log2) ? hsg_config.merge.flip.lo : min(level - bc_max_log2,hsg_config.merge.flip.hi); |
| uint32_t const fm_scale = level - fm_level; |
| |
| ops = hsg_fm_merge(ops, |
| level, |
| hsg_merge[0].warps * (1u << fm_level), |
| fm_scale); |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_op * |
| hsg_hm_merge_level(struct hsg_op * ops, uint32_t const level) |
| { |
| uint32_t const bc_max = pow2_rd_u32(hsg_merge[0].warps); |
| uint32_t const bc_max_log2 = msb_idx_u32(bc_max); |
| |
| uint32_t const fm_log2_max = bc_max_log2 + hsg_config.merge.flip.hi; |
| |
| if (level > fm_log2_max) |
| { |
| uint32_t const down_warps_log2 = level - fm_log2_max; |
| uint32_t const hm_level = max(hsg_config.merge.half.lo,min(hsg_config.merge.half.hi,down_warps_log2)); |
| |
| ops = hsg_hm_merge(ops, |
| level - hsg_config.merge.flip.hi, |
| bc_max * (1u << hm_level), |
| down_warps_log2 - hm_level); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // GENERATE MERGE KERNELS |
| // |
| |
| static |
| struct hsg_op * |
| hsg_xm_merge_all(struct hsg_op * ops) |
| { |
| uint32_t const keys_per_block = hsg_merge[0].warps * hsg_config.warp.lanes * hsg_config.thread.regs; |
| uint32_t const blocks = ((1U << hsg_config.merge.max_log2) + keys_per_block - 1) / keys_per_block; |
| uint32_t const blocks_ru = pow2_ru_u32(blocks); |
| uint32_t const blocks_log2 = msb_idx_u32(blocks_ru); |
| |
| for (uint32_t level=1; level<=blocks_log2; level+=1) |
| { |
| // |
| // GENERATE FLIP MERGE KERNELS |
| // |
| ops = hsg_fm_merge_level(ops,level); |
| |
| // |
| // GENERATE HALF MERGE KERNELS |
| // |
| ops = hsg_hm_merge_level(ops,level); |
| } |
| |
| return ops; |
| } |
| |
| // |
| // |
| // |
| |
| void |
| hsg_target_indent(struct hsg_file * const files, uint32_t const depth) |
| { |
| fprintf(files[HSG_FILE_TYPE_SOURCE].file, |
| "%*s", |
| depth*HSG_INDENT,""); |
| } |
| |
| void |
| hsg_target_debug(struct hsg_file * const files, |
| const struct hsg_merge * const merge, |
| const struct hsg_op * const ops, |
| uint32_t const depth) |
| { |
| |
| hsg_target_indent(files,depth); |
| |
| fprintf(files[HSG_FILE_TYPE_SOURCE].file, |
| "%s\n", |
| hsg_op_type_string[ops->type]); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| struct hsg_file* |
| hsg_files_open(const char * prefix, const char ** suffix) |
| { |
| #define STR_BUF_SIZE 80 |
| |
| struct hsg_file * files = malloc(sizeof(struct hsg_file) * HSG_FILE_TYPE_COUNT); |
| |
| for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++) |
| { |
| char * name = files[ii].name; |
| |
| // save prefix |
| files[ii].prefix = prefix; |
| |
| // build filename |
| strcpy_s(name,STR_BUF_SIZE,prefix); |
| strcat_s(name,STR_BUF_SIZE,suffix[ii]); |
| |
| // open file |
| fopen_s(&files[ii].file,name,"w+"); |
| } |
| |
| return files; |
| } |
| |
| static |
| void |
| hsg_files_close(struct hsg_file * files) |
| { |
| for (int32_t ii=0; ii<HSG_FILE_TYPE_COUNT; ii++) |
| fclose(files[ii].file); |
| } |
| |
| // |
| // |
| // |
| |
| static |
| const struct hsg_op * |
| hsg_op_translate_depth(hsg_target_pfn target_pfn, |
| struct hsg_file * const files, |
| const struct hsg_merge * const merge, |
| const struct hsg_op * ops, |
| uint32_t const depth) |
| { |
| while (ops->type != HSG_OP_TYPE_EXIT) |
| { |
| switch (ops->type) |
| { |
| case HSG_OP_TYPE_END: |
| target_pfn(files,merge,ops,depth-1); |
| return ops + 1; |
| |
| case HSG_OP_TYPE_BEGIN: |
| target_pfn(files,merge,ops,depth); |
| ops = hsg_op_translate_depth(target_pfn,files,merge,ops+1,depth+1); |
| break; |
| |
| default: |
| target_pfn(files,merge,ops++,depth); |
| } |
| } |
| |
| return ops; |
| } |
| |
| static |
| void |
| hsg_op_translate(hsg_target_pfn target_pfn, |
| struct hsg_file * const files, |
| const struct hsg_merge * const merge, |
| const struct hsg_op * ops) |
| { |
| hsg_op_translate_depth(target_pfn,files,merge,ops,0); |
| } |
| |
| // |
| // |
| // |
| |
| int |
| main(int argc, char * argv[]) |
| { |
| // |
| // INIT |
| // |
| for (uint32_t ii=0; ii<=MERGE_LEVELS_MAX_LOG2; ii++) |
| { |
| hsg_merge[ii].index = ii; |
| hsg_merge[ii].warps = 32 / (1u << ii); |
| } |
| |
| // |
| // PROCESS OPTIONS |
| // |
| int32_t arch = 0; |
| int32_t opt = 0; |
| |
| bool quiet = false; |
| bool autotune = false; |
| |
| while ((opt = getopt(argc,argv,"hqa:g:G:s:S:w:b:B:m:M:k:r:x:t:f:F:c:C:z")) != EOF) |
| { |
| switch (opt) |
| { |
| case 'h': |
| fprintf(stderr,"Help goes here...\n"); |
| return -1; |
| |
| case 'q': |
| quiet = true; |
| break; |
| |
| case 'a': |
| arch = atoi(optarg); |
| break; |
| |
| case 'g': |
| hsg_config.block.smem_min = atoi(optarg); |
| break; |
| |
| case 'G': |
| hsg_config.block.smem_quantum = atoi(optarg); |
| break; |
| |
| case 's': |
| hsg_config.block.smem_bs = atoi(optarg); |
| |
| // set smem_bc if not already set |
| if (hsg_config.block.smem_bc == UINT32_MAX) |
| hsg_config.block.smem_bc = hsg_config.block.smem_bs; |
| break; |
| |
| case 'S': |
| hsg_config.block.smem_bc = atoi(optarg); |
| break; |
| |
| case 'w': |
| hsg_config.warp.lanes = atoi(optarg); |
| break; |
| |
| case 'b': |
| // maximum warps in a workgroup / cta / thread block |
| { |
| uint32_t const warps = atoi(optarg); |
| uint32_t const warps_ru_pow2 = pow2_ru_u32(warps); |
| |
| // set warps_max if not already set |
| if (hsg_config.block.warps_max == UINT32_MAX) |
| hsg_config.block.warps_max = warps_ru_pow2; |
| |
| // must always be even |
| if ((warps&1) != 0) |
| { |
| fprintf(stderr,"Error: -b must be even.\n"); |
| exit(-1); |
| } |
| |
| hsg_merge[0].warps = warps; |
| |
| for (uint32_t ii=1; ii<=MERGE_LEVELS_MAX_LOG2; ii++) |
| hsg_merge[ii].warps = warps_ru_pow2 / (1u << ii); |
| } |
| break; |
| |
| case 'B': |
| // maximum warps that can fit in a multiprocessor |
| hsg_config.block.warps_max = atoi(optarg); |
| break; |
| |
| case 'm': |
| // blocks using smem barriers must at least this many warps |
| hsg_config.block.warps_min = atoi(optarg); |
| break; |
| |
| case 'M': |
| // the number of warps necessary to load balance horizontal merging |
| hsg_config.block.warps_mod = atoi(optarg); |
| break; |
| |
| case 'k': |
| hsg_config.merge.max_log2 = atoi(optarg); |
| break; |
| |
| case 'r': |
| { |
| uint32_t const regs = atoi(optarg); |
| |
| if ((regs&1) != 0) |
| { |
| fprintf(stderr,"Error: -r must be even.\n"); |
| exit(-1); |
| } |
| |
| hsg_config.thread.regs = regs; |
| } |
| break; |
| |
| case 'x': |
| hsg_config.thread.xtra = atoi(optarg); |
| break; |
| |
| case 't': |
| hsg_config.type.words = atoi(optarg); |
| break; |
| |
| case 'f': |
| hsg_config.merge.flip.lo = atoi(optarg); |
| break; |
| |
| case 'F': |
| hsg_config.merge.flip.hi = atoi(optarg); |
| break; |
| |
| case 'c': |
| hsg_config.merge.half.lo = atoi(optarg); |
| break; |
| |
| case 'C': |
| hsg_config.merge.half.hi = atoi(optarg); |
| break; |
| |
| case 'z': |
| autotune = true; |
| break; |
| } |
| } |
| |
| // |
| // WHICH ARCH TARGET? |
| // |
| hsg_target_pfn hsg_target_pfn = (arch < HSG_TARGET_PFN_COUNT) ? hsg_target_pfns[arch] : hsg_target_debug; |
| |
| // |
| // OPEN FILES |
| // |
| struct hsg_file * files = hsg_files_open(hsg_target_pfn_string[arch],hsg_file_type_string[arch]); |
| |
| // |
| // INIT F_KEYS |
| // |
| hsg_config_init_shared(); |
| |
| // |
| // INIT MERGE MAGIC |
| // |
| for (uint32_t ii=0; ii<MERGE_LEVELS_MAX_LOG2; ii++) |
| { |
| struct hsg_merge * const merge = hsg_merge + ii; |
| |
| if (merge->warps == 0) |
| break; |
| |
| fprintf(stderr,">>> Generating: %1u %5u %5u %3u %3u ...\n", |
| hsg_config.type.words, |
| hsg_config.block.smem_bs, |
| hsg_config.block.smem_bc, |
| hsg_config.thread.regs, |
| merge->warps); |
| |
| hsg_merge_levels_init_shared(merge); |
| |
| hsg_merge_levels_init_1(merge,merge->warps,0,0); |
| |
| hsg_merge_levels_hint(merge,autotune); |
| |
| // |
| // THESE ARE FOR DEBUG/INSPECTION |
| // |
| |
| if (!quiet) |
| { |
| hsg_merge_levels_debug(merge); |
| } |
| } |
| |
| if (!quiet) |
| fprintf(stderr,"\n\n"); |
| |
| // |
| // |
| // |
| uint32_t const op_count = 1024*1024; // 2^20 ops for now! |
| struct hsg_op * const ops_begin = malloc(op_count * sizeof(*ops_begin)); |
| struct hsg_op * ops = ops_begin; |
| |
| // |
| // APPEND HEADER |
| // |
| ops = hsg_op(ops,FILE_HEADER()); |
| |
| // |
| // GENERATE TRANSPOSE KERNEL |
| // |
| ops = hsg_warp_transpose(ops); |
| |
| // |
| // GENERATE SORT KERNEL |
| // |
| ops = hsg_bs_sort_all(ops); |
| |
| // |
| // GENERATE CLEAN KERNELS |
| // |
| ops = hsg_bc_clean_all(ops); |
| |
| // |
| // GENERATE MERGE KERNELS |
| // |
| ops = hsg_xm_merge_all(ops); |
| |
| // |
| // APPEND FOOTER |
| // |
| ops = hsg_op(ops,FILE_FOOTER()); |
| |
| // |
| // ... WE'RE DONE! |
| // |
| ops = hsg_exit(ops); |
| |
| // |
| // APPLY TARGET TRANSLATOR TO ACCUMULATED OPS |
| // |
| hsg_op_translate(hsg_target_pfn,files,hsg_merge,ops_begin); |
| |
| // |
| // |
| // |
| if (!quiet) |
| hsg_op_debug(); |
| |
| // |
| // |
| // |
| hsg_files_close(files); |
| |
| return 0; |
| } |
| |
| // |
| // |
| // |