25template <
typename Src0Data,
31 typename ElementwiseOperation,
32 typename SliceLengths,
33 typename DimAccessOrder,
37 bool Src0ResetCoordinateAfterRun,
38 bool Src1ResetCoordinateAfterRun,
39 bool DstResetCoordinateAfterRun>
53 const Index& src0_slice_origin,
54 const Src1Desc& src1_desc,
55 const Index& src1_slice_origin,
56 const DstDesc& dst_desc,
57 const Index& dst_slice_origin,
58 const ElementwiseOperation& element_op)
62 element_op_(element_op)
65 "wrong! cannot evenly divide");
69 const Index& src0_slice_origin_idx)
75 const Index& src1_slice_origin_idx)
85 template <
typename Src0Buffer,
typename Src1Buffer,
typename DstBuffer>
86 __device__
void Run(
const Src0Desc& src0_desc,
87 const Src0Buffer& src0_buf,
88 const Src1Desc& src1_desc,
89 const Src1Buffer& src1_buf,
90 const DstDesc& dst_desc,
107 using src0_vector_t =
typename src0_vector_type::type;
110 using src1_vector_t =
typename src1_vector_type::type;
113 using dst_vector_t =
typename dst_vector_type::type;
115 const bool is_src0_valid =
118 const bool is_src1_valid =
122 auto src0_vector_container = src0_vector_type{
123 src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};
125 auto src1_vector_container = src1_vector_type{
126 src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};
128 auto dst_vector_container = dst_vector_type{};
132 element_op_(dst_vector_container.template AsType<DstData>()(i),
133 src0_vector_container.template AsType<Src0Data>()[i],
134 src1_vector_container.template AsType<Src1Data>()[i]);
137 const bool is_dst_valid =
141 dst_buf.template Update<DstInMemOp, dst_vector_t>(
142 dst_coord_.GetOffset(),
144 dst_vector_container.template AsType<dst_vector_t>()[
I0]);
147 if constexpr(idx_1d.value != num_access - 1)
160 if constexpr(Src0ResetCoordinateAfterRun)
162 const auto src0_reset_step =
168 if constexpr(Src1ResetCoordinateAfterRun)
170 const auto src1_reset_step =
176 if constexpr(DstResetCoordinateAfterRun)
178 const auto dst_reset_step =
195 if constexpr(num_access == 0)
201 constexpr auto reset_step =
210 const Index& src0_slice_origin_step_idx)
213 const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
214 ? src0_slice_origin_step_idx
225 const Index& src1_slice_origin_step_idx)
228 const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
229 ? src1_slice_origin_step_idx
240 const Index& dst_slice_origin_step_idx)
243 const auto adjusted_step_idx = DstResetCoordinateAfterRun
244 ? dst_slice_origin_step_idx
257 const ElementwiseOperation element_op_;
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
InMemoryDataOperationEnum
Definition ck.hpp:277
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
Definition tensor_space_filling_curve.hpp:20
static __device__ __host__ constexpr auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition tensor_space_filling_curve.hpp:52
__host__ static __device__ constexpr index_t GetNumOfAccess()
Definition tensor_space_filling_curve.hpp:41
static __device__ __host__ constexpr auto GetForwardStep(Number< AccessIdx1d >)
Definition tensor_space_filling_curve.hpp:66
MultiIndex< nDim > Index
Definition tensor_space_filling_curve.hpp:23
__device__ void SetSrc0SliceOrigin(const Src0Desc &src0_desc, const Index &src0_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:68
__device__ void SetSrc1SliceOrigin(const Src1Desc &src1_desc, const Index &src1_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:74
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v6r2.hpp:44
static __device__ constexpr auto GetCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v6r2.hpp:185
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:80
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v6r2.hpp:50
__device__ void MoveSrc0SliceWindow(const Src0Desc &src0_desc, const Index &src0_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:209
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::DstCoord decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition threadwise_tensor_slice_transfer_v6r2.hpp:48
__device__ void Run(const Src0Desc &src0_desc, const Src0Buffer &src0_buf, const Src1Desc &src1_desc, const Src1Buffer &src1_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:86
__device__ void MoveSrc1SliceWindow(const Src1Desc &src1_desc, const Index &src1_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:224
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:239
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::Src0Coord decltype(make_tensor_coordinate(Src0Desc{}, Index{})) Src0Coord
Definition threadwise_tensor_slice_transfer_v6r2.hpp:46
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v6r2.hpp:42
ck::ThreadwiseTensorSliceTransfer_v6r2< Src0Data, Src1Data, DstData, Src0Desc, Src1Desc, DstDesc, ElementwiseOperation, decltype(thread_slice_lengths), DimAccessOrder, VectorDim, ScalarPerVector, DstInMemOp, ThreadTransferSrc0ResetCoordinateAfterRun, ThreadTransferSrc1ResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun >::Src1Coord decltype(make_tensor_coordinate(Src1Desc{}, Index{})) Src1Coord
Definition threadwise_tensor_slice_transfer_v6r2.hpp:47
__device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc &src0_desc, const Index &src0_slice_origin, const Src1Desc &src1_desc, const Index &src1_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin, const ElementwiseOperation &element_op)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:52
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition functional2.hpp:33