block_to_ctile_map.hpp Source File#
block_to_ctile_map.hpp
Go to the documentation of this file.
260struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
549 __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
Definition ck.hpp:268
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
__host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
Definition tensor_description/tensor_adaptor.hpp:245
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
__host__ __device__ constexpr auto make_insert_transform(const UpperIndex &up_idx)
Definition multi_index_transform_helper.hpp:157
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
__host__ __device__ bool DefaultValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim)
Definition block_to_ctile_map.hpp:835
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ constexpr auto CalculateGridSize(index_t M, index_t N, index_t k_split) const
Definition block_to_ctile_map.hpp:982
__device__ constexpr auto CalculateBottomIndex(const TopIdx &) const
Definition block_to_ctile_map.hpp:991
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition block_to_ctile_map.hpp:997
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &) const
Definition block_to_ctile_map.hpp:1004
__host__ __device__ BlockToCTileMap_3DGrid_KSplit()=default
__host__ __device__ uint32_t get_sk_tiles() const
Definition block_to_ctile_map.hpp:1592
MDiv k_iters_per_tile
Definition block_to_ctile_map.hpp:1435
__host__ __device__ uint32_t get_workspace_size(uint32_t acc_element_bytes) const
Definition block_to_ctile_map.hpp:1700
__host__ __device__ uint32_t get_tile_intersections(uint32_t tiles_, const MDiv &equiv_tiles_) const
Definition block_to_ctile_map.hpp:1705
MDiv equiv_tiles_little
Definition block_to_ctile_map.hpp:1437
__device__ uint32_t get_acc_buffer_offset_from_block(uint32_t block_idx_) const
Definition block_to_ctile_map.hpp:1763
uint32_t dp_start_block_idx
Definition block_to_ctile_map.hpp:1431
static constexpr uint32_t KPerBlock
Definition block_to_ctile_map.hpp:1424
static constexpr uint32_t NPerBlock
Definition block_to_ctile_map.hpp:1423
MDiv2 n_tiles
Definition block_to_ctile_map.hpp:1434
static constexpr uint32_t min_k_iters_per_sk_block
Definition block_to_ctile_map.hpp:1421
__host__ __device__ uint32_t get_sk_total_iters() const
Definition block_to_ctile_map.hpp:1585
__host__ __device__ uint32_t get_total_acc_buffers() const
Definition block_to_ctile_map.hpp:1722
__host__ __device__ index_t get_grid_dims() const
Definition block_to_ctile_map.hpp:1599
__device__ uint32_t get_tile_idx(uint32_t iter) const
Definition block_to_ctile_map.hpp:1650
__host__ __device__ uint32_t get_workspace_size_for_semaphore() const
Definition block_to_ctile_map.hpp:1695
__device__ void get_block_itr(uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const
Definition block_to_ctile_map.hpp:1617
uint32_t k_iters_per_big_block
Definition block_to_ctile_map.hpp:1433
uint32_t sk_num_big_blocks
Definition block_to_ctile_map.hpp:1430
MDiv equiv_tiles_big
Definition block_to_ctile_map.hpp:1436
__device__ uint32_t get_acc_buffer_offset_from_tile(uint32_t tile_idx_) const
Definition block_to_ctile_map.hpp:1737
__device__ auto tile_to_spatial(uint32_t tile_idx, uint32_t m, uint32_t n) const
Definition block_to_ctile_map.hpp:1658
__host__ __device__ uint32_t get_workspace_size_for_acc(uint32_t acc_element_bytes) const
Definition block_to_ctile_map.hpp:1687
__device__ uint32_t get_current_iter_length(uint32_t iter_start, uint32_t iter_end, uint32_t total_iter_length) const
Definition block_to_ctile_map.hpp:1639
static constexpr uint32_t tile_swizzle_sub_m
Definition block_to_ctile_map.hpp:1425
uint32_t reduction_start_block_idx
Definition block_to_ctile_map.hpp:1432
__host__ __device__ uint32_t get_tiles_cover_sk_block(uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const
Definition block_to_ctile_map.hpp:1715
__host__ __device__ BlockToCTileMap_GemmStreamK_v2(uint32_t m, uint32_t n, uint32_t k, uint32_t grid_size=1, uint32_t streamk_sel=1, StreamKReductionStrategy reduction_strategy_=StreamKReductionStrategy::Atomic)
Definition block_to_ctile_map.hpp:1441
uint32_t sk_num_blocks
Definition block_to_ctile_map.hpp:1429
__device__ uint32_t get_block_idx() const
Definition block_to_ctile_map.hpp:1610
__device__ void get_tile_idx_with_offset(uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const
Definition block_to_ctile_map.hpp:1653
StreamKReductionStrategy reduction_strategy
Definition block_to_ctile_map.hpp:1438
__host__ static __device__ constexpr index_t CalculateGridSize(index_t M, index_t N)
Definition block_to_ctile_map.hpp:1578
static constexpr uint32_t MPerBlock
Definition block_to_ctile_map.hpp:1422
uint32_t k_iters_per_big_block
Definition block_to_ctile_map.hpp:1036
__host__ __device__ uint32_t get_workspace_size(uint32_t acc_element_bytes) const
Definition block_to_ctile_map.hpp:1327
__device__ uint32_t get_acc_buffer_offset_from_block(uint32_t block_idx_) const
Definition block_to_ctile_map.hpp:1390
__host__ __device__ uint32_t get_sk_total_iters() const
Definition block_to_ctile_map.hpp:1213
__host__ __device__ uint32_t get_tiles_cover_sk_block(uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const
Definition block_to_ctile_map.hpp:1342
static constexpr uint32_t MPerBlock
Definition block_to_ctile_map.hpp:1024
uint32_t dp_start_block_idx
Definition block_to_ctile_map.hpp:1034
__host__ __device__ uint32_t get_sk_tiles() const
Definition block_to_ctile_map.hpp:1220
static constexpr uint32_t KPerBlock
Definition block_to_ctile_map.hpp:1026
__host__ __device__ uint32_t get_total_acc_buffers() const
Definition block_to_ctile_map.hpp:1349
__device__ uint32_t get_current_iter_length(uint32_t iter_start, uint32_t iter_end, uint32_t total_iter_length) const
Definition block_to_ctile_map.hpp:1266
static constexpr uint32_t NPerBlock
Definition block_to_ctile_map.hpp:1025
__device__ uint32_t get_acc_buffer_offset_from_tile(uint32_t tile_idx_) const
Definition block_to_ctile_map.hpp:1364
uint32_t reduction_start_block_idx
Definition block_to_ctile_map.hpp:1035
__host__ __device__ uint32_t get_workspace_size_for_acc(uint32_t acc_element_bytes) const
Definition block_to_ctile_map.hpp:1314
MDiv k_iters_per_tile
Definition block_to_ctile_map.hpp:1038
__device__ void get_tile_idx_with_offset(uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const
Definition block_to_ctile_map.hpp:1280
static constexpr uint32_t tile_swizzle_sub_m
Definition block_to_ctile_map.hpp:1028
BlockToCTileMap_GemmStreamK(uint32_t m, uint32_t n, uint32_t k, uint32_t num_cu, uint32_t occupancy, uint32_t sk_blocks=0xffffffff)
Definition block_to_ctile_map.hpp:1046
static constexpr StreamKReductionStrategy ReductionStrategy
Definition block_to_ctile_map.hpp:1027
__device__ auto tile_to_spatial(uint32_t tile_idx, uint32_t m, uint32_t n) const
Definition block_to_ctile_map.hpp:1285
__device__ uint32_t get_tile_idx(uint32_t iter) const
Definition block_to_ctile_map.hpp:1277
__host__ __device__ uint32_t get_tile_intersections(uint32_t tiles_, const MDiv &eqav_tiles_) const
Definition block_to_ctile_map.hpp:1332
__device__ uint32_t get_block_idx() const
Definition block_to_ctile_map.hpp:1237
__device__ void get_block_itr(uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const
Definition block_to_ctile_map.hpp:1244
MDiv eqav_tiles_little
Definition block_to_ctile_map.hpp:1040
uint32_t sk_num_blocks
Definition block_to_ctile_map.hpp:1032
MDiv eqav_tiles_big
Definition block_to_ctile_map.hpp:1039
static constexpr uint32_t min_k_iters_per_sk_block
Definition block_to_ctile_map.hpp:1023
uint32_t sk_num_big_blocks
Definition block_to_ctile_map.hpp:1033
__host__ __device__ dim3 get_grid_dims() const
Definition block_to_ctile_map.hpp:1227
__host__ __device__ uint32_t get_workspace_size_for_semaphore() const
Definition block_to_ctile_map.hpp:1322
__host__ static __device__ constexpr index_t CalculateGridSize(index_t M, index_t N)
Definition block_to_ctile_map.hpp:283
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:298
__host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt()=default
static constexpr auto I1
Definition block_to_ctile_map.hpp:273
__host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition block_to_ctile_map.hpp:292
static constexpr auto I0
Definition block_to_ctile_map.hpp:272
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition block_to_ctile_map.hpp:384
__host__ __device__ BlockToCTileMap_Grouped_M00_N0_M01Adapt(index_t M, index_t N, index_t M01=8)
Definition block_to_ctile_map.hpp:276
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:773
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:755
static constexpr auto I2
Definition block_to_ctile_map.hpp:723
static constexpr auto I0
Definition block_to_ctile_map.hpp:721
__host__ BlockToCTileMap_KSplit_M00_N00_M01_N01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1, index_t N01=1, index_t KSplit=1)
Definition block_to_ctile_map.hpp:728
__host__ BlockToCTileMap_KSplit_M00_N00_M01_N01()=default
static constexpr auto I3
Definition block_to_ctile_map.hpp:724
static constexpr auto I1
Definition block_to_ctile_map.hpp:722
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition block_to_ctile_map.hpp:764
__host__ __device__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:741
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:556
__host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=8, index_t KSplit=1)
Definition block_to_ctile_map.hpp:549
__host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt()=default
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:567
static constexpr auto I0
Definition block_to_ctile_map.hpp:542
static constexpr auto I1
Definition block_to_ctile_map.hpp:543
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &) const
Definition block_to_ctile_map.hpp:600
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition block_to_ctile_map.hpp:594
static constexpr auto I2
Definition block_to_ctile_map.hpp:544
static constexpr auto I3
Definition block_to_ctile_map.hpp:545
__host__ __device__ BlockToCTileMap_M00_N00_M01_N01()=default
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:646
__host__ __device__ BlockToCTileMap_M00_N00_M01_N01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1, index_t N01=1)
Definition block_to_ctile_map.hpp:625
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:661
static constexpr auto I0
Definition block_to_ctile_map.hpp:618
static constexpr auto I3
Definition block_to_ctile_map.hpp:621
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:632
static constexpr auto I1
Definition block_to_ctile_map.hpp:619
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition block_to_ctile_map.hpp:652
static constexpr auto I2
Definition block_to_ctile_map.hpp:620
__host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition block_to_ctile_map.hpp:246
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(index_t M, index_t N, index_t M01=8)
Definition block_to_ctile_map.hpp:138
__host__ static __device__ constexpr index_t CalculateGridSize(index_t M, index_t N)
Definition block_to_ctile_map.hpp:157
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt & operator=(const BlockToCTileMap_M00_N0_M01Adapt &)=default
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:179
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(BlockToCTileMap_M00_N0_M01Adapt &&)=default
static constexpr auto I0
Definition block_to_ctile_map.hpp:123
static __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n)
Definition block_to_ctile_map.hpp:166
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt & operator=(BlockToCTileMap_M00_N0_M01Adapt &&)=default
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=8)
Definition block_to_ctile_map.hpp:150
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt()=default
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const BlockToCTileMap_M00_N0_M01Adapt &)=default
__host__ __device__ constexpr bool CheckValidity(const CGridDesc_M_N &) const
Definition block_to_ctile_map.hpp:173
static constexpr auto I1
Definition block_to_ctile_map.hpp:124
Definition block_to_ctile_map.hpp:261
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01()=default
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:66
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:51
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N &c_grid_desc_m_n, index_t M01=1)
Definition block_to_ctile_map.hpp:32
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:38
__host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition block_to_ctile_map.hpp:57
__host__ bool CheckValidity(const CGridDesc_M_N &) const
Definition block_to_ctile_map.hpp:451
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt()=default
static __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n)
Definition block_to_ctile_map.hpp:445
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt & operator=(BlockToCTileMap_N00_M0_N01Adapt &&)=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(const CGridDesc_M_N &c_grid_desc_m_n, index_t N01=8)
Definition block_to_ctile_map.hpp:429
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt & operator=(const BlockToCTileMap_N00_M0_N01Adapt &)=default
__host__ __device__ bool ValidCTileIndex(const CTileIdx &, const CTileDim &) const
Definition block_to_ctile_map.hpp:525
__host__ static __device__ constexpr index_t CalculateGridSize(index_t M, index_t N)
Definition block_to_ctile_map.hpp:436
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:457
static constexpr auto I1
Definition block_to_ctile_map.hpp:405
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(const BlockToCTileMap_N00_M0_N01Adapt &)=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(BlockToCTileMap_N00_M0_N01Adapt &&)=default
__host__ __device__ BlockToCTileMap_N00_M0_N01Adapt(index_t M, index_t N, index_t N01=8)
Definition block_to_ctile_map.hpp:418
static constexpr auto I0
Definition block_to_ctile_map.hpp:404
Definition block_to_ctile_map.hpp:399
Definition magic_division.hpp:204
Definition magic_division.hpp:162
__host__ __device__ void divmod(uint32_t dividend_, uint32_t "ient_, uint32_t &remainder_) const
Definition magic_division.hpp:194
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition block_to_ctile_map.hpp:940
index_t tile_offset_
Definition block_to_ctile_map.hpp:960
Block2ETileMap block_to_ctile_map_
Definition block_to_ctile_map.hpp:958
__host__ __device__ OffsettedBlockToCTileMap2(UnderlyingBlockToCTileMap block_to_ctile_map, index_t group_offset, index_t tile_offset)
Definition block_to_ctile_map.hpp:923
Block2ETileMap underlying_type
Definition block_to_ctile_map.hpp:921
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:933
index_t group_offset_
Definition block_to_ctile_map.hpp:959
__device__ void UpdateTileOffset(index_t offset)
Definition block_to_ctile_map.hpp:957
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:947
__host__ __device__ constexpr index_t CalculateGridSize(index_t M, index_t N) const
Definition block_to_ctile_map.hpp:952
__host__ __device__ bool ValidCTileIndex(const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) const
Definition block_to_ctile_map.hpp:891
__host__ __device__ constexpr index_t CalculateGridSize(index_t M, index_t N) const
Definition block_to_ctile_map.hpp:909
index_t block_start_
Definition block_to_ctile_map.hpp:915
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx &idx_top) const
Definition block_to_ctile_map.hpp:884
__host__ __device__ OffsettedBlockToCTileMap()=default
__host__ constexpr bool CheckValidity(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:898
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N &c_grid_desc_m_n) const
Definition block_to_ctile_map.hpp:904
__host__ __device__ OffsettedBlockToCTileMap(UnderlyingBlockToCTileMap block_to_ctile_map, index_t block_start)
Definition block_to_ctile_map.hpp:876
Block2ETileMap underlying_type
Definition block_to_ctile_map.hpp:873
Block2ETileMap block_to_ctile_map_
Definition block_to_ctile_map.hpp:914