Invoker Struct Reference

Invoker Struct Reference#

Composable Kernel: ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker Struct Reference
ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker Struct Reference

#include <device_gemm_xdl_layernorm_cshuffle.hpp>

Inheritance diagram for ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker:
ck::tensor_operation::device::BaseInvoker

Public Types

using Argument = DeviceOp::Argument

Public Member Functions

template<typename GridwiseGemm>
float RunImp (const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
INVOKER_RUN_IMPL float Run (const BaseArgument *p_arg, const StreamConfig &stream_config=StreamConfig{}) override
Public Member Functions inherited from ck::tensor_operation::device::BaseInvoker
 BaseInvoker ()=default
 BaseInvoker (const BaseInvoker &)=default
BaseInvokeroperator= (const BaseInvoker &)=default
virtual ~BaseInvoker ()

Member Typedef Documentation

◆ Argument

template<typename ALayout, typename BLayout, typename CLayout, typename ADataType, typename BDataType, typename CDataType, typename C0DataType, typename GemmAccDataType, typename CShuffleDataType, typename ReduceAccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
using ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker::Argument = DeviceOp::Argument

Member Function Documentation

◆ Run()

template<typename ALayout, typename BLayout, typename CLayout, typename ADataType, typename BDataType, typename CDataType, typename C0DataType, typename GemmAccDataType, typename CShuffleDataType, typename ReduceAccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
INVOKER_RUN_IMPL float ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker::Run ( const BaseArgument * p_arg,
const StreamConfig & stream_config = StreamConfig{} )
inlineoverridevirtual

◆ RunImp()

template<typename ALayout, typename BLayout, typename CLayout, typename ADataType, typename BDataType, typename CDataType, typename C0DataType, typename GemmAccDataType, typename CShuffleDataType, typename ReduceAccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
template<typename GridwiseGemm>
float ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, C0DataType, GemmAccDataType, CShuffleDataType, ReduceAccDataType, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched >::Invoker::RunImp ( const Argument & arg,
const StreamConfig & stream_config = StreamConfig{} )
inline

The documentation for this struct was generated from the following file: