xdlops_gemm.hpp Source File
xdlops_gemm.hpp
Go to the documentation of this file.
37template <typename ADataType, typename BDataType, typename AScaleDataType, typename BScaleDataType>
835 static constexpr index_t group_size = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
836 static constexpr index_t num_groups_per_blk = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
844 static constexpr index_t k_per_blk = 32; // (is_k_reduction == true) ? KPerXdlops / num_input_blks
859 static constexpr index_t group_size = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
860 static constexpr index_t num_groups_per_blk = 1; // ??? group_size * num_groups_per_blk == num_regs_per_blk
868 static constexpr index_t k_per_blk = 32; // (is_k_reduction == true) ? KPerXdlops / num_input_blks
883 static constexpr index_t group_size = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
884 static constexpr index_t num_groups_per_blk = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
892 static constexpr index_t k_per_blk = 32; // (is_k_reduction == true) ? KPerXdlops / num_input_blks
920 static constexpr index_t group_size = 4; // ??? group_size * num_groups_per_blk == num_regs_per_blk
921 static constexpr index_t num_groups_per_blk = 1; // ??? group_size * num_groups_per_blk == num_regs_per_blk
929 static constexpr index_t k_per_blk = 32; // (is_k_reduction == true) ? KPerXdlops / num_input_blks
1858 MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
1938 MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2)
2179 // Falls back to single rate instruction on gfx950 if KPack is single rate; no change on gfx942-
2180 // when base_type is either f8_t or bf8_t, additional_type will always be either f8_t or bf8_t,
Definition ck.hpp:268
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
@ wmma_f32_16x16x16_bf16_gfx12
Definition xdlops_gemm.hpp:92
@ wmma_i32_16x16x16_iu8_gfx12
Definition xdlops_gemm.hpp:93
@ mfma_scale_f32_32x32x64f8f6f4
Definition xdlops_gemm.hpp:81
@ wmma_f32_16x16x16_bf8f8_gfx12
Definition xdlops_gemm.hpp:96
@ wmma_f32_16x16x16_f16_gfx12
Definition xdlops_gemm.hpp:91
@ wmma_f32_16x16x16_bf8bf8_gfx12
Definition xdlops_gemm.hpp:97
@ wmma_f32_16x16x16_f8f8_gfx12
Definition xdlops_gemm.hpp:94
@ mfma_scale_f32_16x16x128f8f6f4
Definition xdlops_gemm.hpp:82
@ wmma_f32_16x16x16_f8bf8_gfx12
Definition xdlops_gemm.hpp:95
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
typename packed_type_info< T >::element_type element_type_t
Definition data_type.hpp:408
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
@ wmma_f32_16x16x16_bf16_gfx12
Definition wmma_gemm.hpp:23
@ wmma_f32_16x16x16_bf8f8_gfx12
Definition wmma_gemm.hpp:27
@ wmma_f32_16x16x16_bf8bf8_gfx12
Definition wmma_gemm.hpp:28
@ wmma_f32_16x16x16_f8f8_gfx12
Definition wmma_gemm.hpp:25
@ wmma_f32_16x16x16_f8bf8_gfx12
Definition wmma_gemm.hpp:26
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes o...
Definition xdlops_gemm.hpp:1208
static constexpr index_t GetK1PerXdlops()
Definition xdlops_gemm.hpp:1810
static constexpr auto GetMfma()
static constexpr auto selected_mfma
Definition xdlops_gemm.hpp:1757
static constexpr index_t GetKPerXdlops()
Definition xdlops_gemm.hpp:1804
__host__ __device__ constexpr MfmaSelector()
Definition xdlops_gemm.hpp:1764
Definition utility/sequence.hpp:43
static __device__ constexpr index_t GetWaveSize()
Definition xdlops_gemm.hpp:2012
__host__ static __device__ auto CalculateBThreadOriginDataIndex()
Definition xdlops_gemm.hpp:2134
__host__ static __device__ constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_G_M0_N0_M1_N1_M2_N2 &c_desc_g_m0_n0_m1_n1_m2_n2)
Definition xdlops_gemm.hpp:1971
__host__ static __device__ auto CalculateAThreadOriginDataIndex()
Definition xdlops_gemm.hpp:2112
static __device__ constexpr index_t GetNumXdlops()
Definition xdlops_gemm.hpp:1834
static constexpr bool is_single_rate_mfma
Definition xdlops_gemm.hpp:2182
static __device__ CIndex4D GetBeginOfThreadBlk4D(index_t, index_t)
Definition xdlops_gemm.hpp:2169
static __device__ constexpr index_t GetNumBlks()
Definition xdlops_gemm.hpp:1832
__device__ static __host__ constexpr index_t GetRegSizePerXdlops()
Definition xdlops_gemm.hpp:2007
static __device__ auto GetGfx11InputBlkIdx()
Definition xdlops_gemm.hpp:2090
__host__ static __device__ constexpr auto MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2 &c_desc_m0_n0_m1_n1_m2_n2)
Definition xdlops_gemm.hpp:1938
__device__ void Run(const FloatA &p_a_wave, const ScaleA &a_scale_thread, const FloatB &p_b_wave, const ScaleB &b_scale_thread, FloatC &p_c_thread) const
Definition xdlops_gemm.hpp:2047
__host__ static __device__ constexpr auto MakeCDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3(const CDesc_M0_N0_M1_N1_M2_N2 &c_desc_m0_n0_m1_n1_m2_n2)
Definition xdlops_gemm.hpp:1893
__host__ static __device__ constexpr auto GetCM0M1M2NThreadBlkLengths()
Definition xdlops_gemm.hpp:2203
__device__ void Run(const FloatA &p_a_wave, const FloatB &p_b_wave, FloatC &p_c_thread) const
Definition xdlops_gemm.hpp:2015
__host__ static __device__ constexpr auto MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2 &c_desc_m0_n0_m1_n1_m2_n2)
Definition xdlops_gemm.hpp:1858
static __device__ CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i)
Definition xdlops_gemm.hpp:2156
Definition amd_xdlops.hpp:1202
Definition amd_xdlops.hpp:303
Definition amd_xdlops.hpp:193
Definition amd_xdlops.hpp:70
Definition amd_xdlops.hpp:269
Definition amd_xdlops.hpp:1483
Definition amd_xdlops.hpp:1609
Definition amd_xdlops.hpp:159
Definition amd_xdlops.hpp:1546
Definition amd_xdlops.hpp:1420
Definition amd_xdlops.hpp:207
Definition amd_xdlops.hpp:56
Definition amd_xdlops.hpp:331
Definition amd_xdlops.hpp:1641
Definition amd_xdlops.hpp:249
Definition amd_xdlops.hpp:1451
Definition amd_xdlops.hpp:1577
Definition amd_xdlops.hpp:139
Definition amd_xdlops.hpp:1514
Definition amd_xdlops.hpp:1388
Definition amd_xdlops.hpp:15
Definition amd_xdlops.hpp:42
Definition amd_xdlops.hpp:317
Definition amd_xdlops.hpp:112
Definition amd_xdlops.hpp:1661
Definition amd_xdlops.hpp:481
Definition amd_xdlops.hpp:289
Definition amd_xdlops.hpp:179
Definition amd_xdlops.hpp:84
Definition amd_xdlops.hpp:221
Definition amd_xdlops.hpp:461
Definition amd_xdlops.hpp:364
Definition amd_xdlops.hpp:442
Definition amd_xdlops.hpp:403
Definition amd_xdlops.hpp:423
Definition amd_xdlops.hpp:383
Definition amd_xdlops.hpp:345
Definition amd_xdlops.hpp:886
Definition amd_xdlops.hpp:666
Definition amd_wmma.hpp:297
Definition amd_wmma.hpp:50
Definition amd_wmma.hpp:418
Definition amd_wmma.hpp:394
Definition amd_wmma.hpp:271
Definition amd_wmma.hpp:25
Definition amd_wmma.hpp:370
Definition amd_wmma.hpp:346
Definition amd_wmma.hpp:319
Definition amd_wmma.hpp:121
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:867
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:862
static constexpr index_t group_size
Definition xdlops_gemm.hpp:859
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:866
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:865
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:869
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:861
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:873
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:863
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:868
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:864
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:860
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:445
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:448
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:440
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:439
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:443
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:444
static constexpr index_t group_size
Definition xdlops_gemm.hpp:438
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:441
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:442
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:447
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:446
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:451
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:312
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:313
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:309
static constexpr index_t group_size
Definition xdlops_gemm.hpp:306
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:315
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:319
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:307
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:314
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:316
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:311
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:310
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:308
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:180
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:174
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:186
static constexpr index_t group_size
Definition xdlops_gemm.hpp:173
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:182
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:178
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:179
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:183
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:181
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:175
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:176
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:177
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:421
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:422
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:420
static constexpr index_t group_size
Definition xdlops_gemm.hpp:416
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:425
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:417
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:424
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:418
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:426
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:429
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:419
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:423
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:728
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:726
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:731
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:737
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:729
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:730
static constexpr index_t group_size
Definition xdlops_gemm.hpp:724
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:732
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:733
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:727
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:725
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:734
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:820
static constexpr index_t group_size
Definition xdlops_gemm.hpp:812
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:825
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:813
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:816
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:819
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:815
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:814
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:818
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:821
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:817
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:822
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:292
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:297
static constexpr index_t group_size
Definition xdlops_gemm.hpp:284
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:288
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:287
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:293
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:289
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:286
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:290
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:291
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:285
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:294
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:772
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:770
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:773
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:776
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:771
static constexpr index_t group_size
Definition xdlops_gemm.hpp:768
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:777
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:781
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:769
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:775
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:778
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:774
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:688
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:687
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:690
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:689
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:681
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:693
static constexpr index_t group_size
Definition xdlops_gemm.hpp:680
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:684
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:683
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:686
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:685
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:682
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:332
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:338
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:331
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:337
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:329
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:341
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:333
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:335
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:336
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:334
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:330
static constexpr index_t group_size
Definition xdlops_gemm.hpp:328
static constexpr index_t group_size
Definition xdlops_gemm.hpp:151
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:156
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:152
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:164
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:157
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:159
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:153
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:155
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:154
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:161
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:160
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:158
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:492
static constexpr index_t group_size
Definition xdlops_gemm.hpp:482
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:487
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:495
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:489
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:488
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:491
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:483
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:490
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:484
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:485
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:486
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:976
static constexpr index_t group_size
Definition xdlops_gemm.hpp:982
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:978
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:979
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:985
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:983
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:984
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:977
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:981
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:986
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:980
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:990
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:378
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:375
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:373
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:374
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:379
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:380
static constexpr index_t group_size
Definition xdlops_gemm.hpp:372
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:381
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:382
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:376
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:385
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:377
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:708
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:705
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:709
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:707
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:703
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:712
static constexpr index_t group_size
Definition xdlops_gemm.hpp:702
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:711
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:704
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:706
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:710
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:715
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:797
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:798
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:800
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:796
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:794
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:792
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:795
static constexpr index_t group_size
Definition xdlops_gemm.hpp:790
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:803
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:799
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:793
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:791
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:268
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:272
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:267
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:271
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:264
static constexpr index_t group_size
Definition xdlops_gemm.hpp:262
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:263
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:275
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:265
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:266
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:269
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:270
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:755
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:752
static constexpr index_t group_size
Definition xdlops_gemm.hpp:746
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:759
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:748
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:747
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:753
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:756
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:750
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:751
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:749
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:754
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:663
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:662
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:664
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:667
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:659
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:668
static constexpr index_t group_size
Definition xdlops_gemm.hpp:658
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:661
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:671
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:665
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:666
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:660
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:113
static constexpr index_t group_size
Definition xdlops_gemm.hpp:107
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:114
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:109
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:111
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:115
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:117
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:108
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:116
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:110
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:120
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:112
static constexpr index_t group_size
Definition xdlops_gemm.hpp:129
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:142
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:138
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:135
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:134
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:137
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:132
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:139
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:133
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:131
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:136
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:130
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:463
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:469
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:473
static constexpr index_t group_size
Definition xdlops_gemm.hpp:460
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:470
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:461
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:466
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:465
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:467
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:468
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:464
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:462
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:226
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:224
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:221
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:227
static constexpr index_t group_size
Definition xdlops_gemm.hpp:218
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:220
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:231
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:219
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:222
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:228
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:225
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:223
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:1009
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:1004
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:1007
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:1008
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:999
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:1006
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:1000
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:1003
static constexpr index_t group_size
Definition xdlops_gemm.hpp:1005
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:1001
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1012
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:1002
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:845
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:836
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:849
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:837
static constexpr index_t group_size
Definition xdlops_gemm.hpp:835
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:842
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:840
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:838
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:841
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:839
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:844
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:843
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:403
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:402
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:398
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:396
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:407
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:400
static constexpr index_t group_size
Definition xdlops_gemm.hpp:394
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:399
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:395
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:397
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:401
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:404
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:249
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:246
static constexpr index_t group_size
Definition xdlops_gemm.hpp:240
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:247
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:250
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:243
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:242
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:244
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:248
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:245
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:241
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:253
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:209
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:197
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:204
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:199
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:203
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:198
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:202
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:206
static constexpr index_t group_size
Definition xdlops_gemm.hpp:196
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:200
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:205
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:201
static constexpr index_t group_size
Definition xdlops_gemm.hpp:350
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:353
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:358
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:357
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:351
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:363
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:355
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:360
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:352
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:354
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:356
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:359
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:644
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:642
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:639
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:649
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:638
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:640
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:645
static constexpr index_t group_size
Definition xdlops_gemm.hpp:636
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:646
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:643
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:637
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:641
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:539
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:532
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:528
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:530
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:536
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:527
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:531
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:534
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:535
static constexpr index_t group_size
Definition xdlops_gemm.hpp:526
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:533
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:529
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:577
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:575
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:579
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:574
static constexpr index_t group_size
Definition xdlops_gemm.hpp:570
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:578
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:583
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:573
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:580
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:576
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:571
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:572
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:621
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:627
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:622
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:618
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:619
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:620
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:615
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:624
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:616
static constexpr index_t group_size
Definition xdlops_gemm.hpp:614
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:617
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:623
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:561
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:553
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:550
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:555
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:554
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:556
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:552
static constexpr index_t group_size
Definition xdlops_gemm.hpp:548
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:557
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:551
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:549
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:558
static constexpr index_t group_size
Definition xdlops_gemm.hpp:592
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:595
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:599
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:598
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:596
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:601
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:605
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:593
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:602
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:597
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:600
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:594
static constexpr index_t group_size
Definition xdlops_gemm.hpp:504
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:506
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:517
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:507
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:513
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:512
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:511
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:514
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:509
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:510
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:508
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:505
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:930
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:922
__device__ void run(const FloatA &a, const ScaleA &scale_a, const FloatB &b, const ScaleB &scale_b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:942
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:928
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:925
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:927
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:926
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:929
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:924
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:921
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:923
static constexpr index_t group_size
Definition xdlops_gemm.hpp:920
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:890
__device__ void run(const FloatA &a, const ScaleA &scale_a, const FloatB &b, const ScaleB &scale_b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:905
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:893
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:887
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:891
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:884
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:885
static constexpr index_t group_size
Definition xdlops_gemm.hpp:883
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:886
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:888
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:892
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:889
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1048
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1112
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1170
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1160
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1038
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1102
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1150
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1140
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1065
__device__ void run(const FloatA &a, const FloatB &b, FloatC &reg_c) const
Definition xdlops_gemm.hpp:1129
__device__ void run(const FloatA &, const FloatB &, FloatC &) const
Definition xdlops_gemm.hpp:1076
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:1074
__device__ void run(const FloatA &, const FloatB &, FloatC &) const
Definition xdlops_gemm.hpp:1181
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:1179
Definition xdlops_gemm.hpp:1020
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:1029
static constexpr index_t group_size
Definition xdlops_gemm.hpp:1021
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:1028
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:1031
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:1024
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:1027
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:1025
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:1026
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:1022
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:1023
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:1030
Definition xdlops_gemm.hpp:1084
static constexpr index_t n_per_blk
Definition xdlops_gemm.hpp:1093
static constexpr index_t group_size
Definition xdlops_gemm.hpp:1085
static constexpr index_t num_output_blks
Definition xdlops_gemm.hpp:1091
static constexpr index_t m_per_blk
Definition xdlops_gemm.hpp:1092
static constexpr index_t num_threads_per_blk
Definition xdlops_gemm.hpp:1088
static constexpr bool is_k_reduction
Definition xdlops_gemm.hpp:1095
static constexpr index_t num_regs_per_blk
Definition xdlops_gemm.hpp:1087
static constexpr index_t num_groups_per_blk
Definition xdlops_gemm.hpp:1086
static constexpr index_t num_input_blks
Definition xdlops_gemm.hpp:1090
static constexpr index_t wave_size
Definition xdlops_gemm.hpp:1089
static constexpr index_t k_per_blk
Definition xdlops_gemm.hpp:1094
Definition xdlops_gemm.hpp:102
Definition functional2.hpp:33