// Build-time configuration for the micro-kernel assembly strings in this file.
// When the includer has not chosen an MFMA flavour, BF16 is used.
8#ifndef CK_TILE_FLATMM_UK_MFMA
9#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
// _UK_MFMA_ is the instruction mnemonic (a string literal) that the pipeline macros paste in
// front of every matrix-multiply line; it is the only thing that differs between BF16 and FP16.
12#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
13#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"
14#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
15#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"
// CK_TILE_FLATMM_UK_2B defaults to 0; the value selects which instruction sequence is emitted
// further down (see the `#if CK_TILE_FLATMM_UK_2B == 0` section).
18#ifndef CK_TILE_FLATMM_UK_2B
19#define CK_TILE_FLATMM_UK_2B 0
23 "buffer_load_dword %[v_os_a0], s[16:19], 0 offen lds\n" \
24 "s_add_u32 m0, %[s_size_per_issue], m0\n"
27 "buffer_load_dword %[v_os_a1], s[16:19], 0 offen lds\n" \
28 "s_add_u32 m0, %[s_size_per_issue], m0\n"
31 "buffer_load_dword %[v_os_a2], s[16:19], 0 offen lds\n" \
32 "s_add_u32 m0, %[s_size_per_issue], m0\n"
35 "buffer_load_dword %[v_os_a3], s[16:19], 0 offen lds\n" \
36 "s_add_u32 m0, %[s_size_per_issue], m0\n"
39 "buffer_load_dword %[v_os_a4], s[16:19], 0 offen lds\n" \
40 "s_add_u32 m0, %[s_size_per_issue], m0\n"
43 "buffer_load_dword %[v_os_a5], s[16:19], 0 offen lds\n" \
44 "s_add_u32 m0, %[s_size_per_issue], m0\n"
47 "buffer_load_dword %[v_os_a6], s[16:19], 0 offen lds\n" \
48 "s_add_u32 m0, %[s_size_per_issue], m0\n"
// _UK_GLD_A7_AND_L1: issues the `buffer_load_dword ... lds` for offset register v_os_a7 through
// the resource in s[16:19], then sets m0 = smem_sz + s_m0_init.
// NOTE(review): presumably this re-points m0 at the second LDS buffer (the one the _UK_SLD_A*_Y
// reads address with offset 1*smem_sz) for the next round of loads — confirm.
50#define _UK_GLD_A7_AND_L1 \
51 "buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds\n" \
52 "s_add_u32 m0, %[smem_sz], %[s_m0_init]\n"
// _UK_GLD_A7_AND_L0: same load as _UK_GLD_A7_AND_L1 (v_os_a7 through s[16:19], `lds` form),
// but afterwards m0 is reset to s_m0_init (0 + s_m0_init) instead of smem_sz + s_m0_init.
// NOTE(review): presumably this re-points m0 at the first LDS buffer (offset 0*smem_sz, the one
// read by _UK_SLD_A*_X) — confirm.
54#define _UK_GLD_A7_AND_L0 \
55 "buffer_load_dword %[v_os_a7], s[16:19], 0 offen lds\n" \
56 "s_add_u32 m0, 0, %[s_m0_init]\n"
// _UK_SLD_A0_X .. _UK_SLD_A7_X: eight `ds_read_b128` strings, each reading into four consecutive
// VGPRs; together they fill v[64:95]. All use the address register v_os_slda and differ only in
// the immediate offset, 0*smem_sz + sld_os_N (N = 0..7).
// The v[64:95] range is the one passed as the src_a operands of the first pipeline step in the
// main loop.
60#define _UK_SLD_A0_X "ds_read_b128 v[64:67], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_0]\n"
61#define _UK_SLD_A1_X "ds_read_b128 v[68:71], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_1]\n"
62#define _UK_SLD_A2_X "ds_read_b128 v[72:75], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_2]\n"
63#define _UK_SLD_A3_X "ds_read_b128 v[76:79], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_3]\n"
64#define _UK_SLD_A4_X "ds_read_b128 v[80:83], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_4]\n"
65#define _UK_SLD_A5_X "ds_read_b128 v[84:87], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_5]\n"
66#define _UK_SLD_A6_X "ds_read_b128 v[88:91], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_6]\n"
67#define _UK_SLD_A7_X "ds_read_b128 v[92:95], %[v_os_slda] offset:0*%[smem_sz] + %[sld_os_7]\n"
// _UK_SLD_A0_Y .. _UK_SLD_A7_Y: counterpart of the _X set. Same address register (v_os_slda) and
// per-slot offsets sld_os_N, but the immediate offset is 1*smem_sz + sld_os_N and the results go
// to v[96:127] instead of v[64:95].
// NOTE(review): unlike the _X strings, these have a comma before `offset:` and _UK_SLD_A0_Y
// spells its destination as `v[96 : 99]` with spaces. Confirm the assembler accepts both
// spellings identically before normalising either set.
69#define _UK_SLD_A0_Y "ds_read_b128 v[96 : 99], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_0]\n"
70#define _UK_SLD_A1_Y "ds_read_b128 v[100:103], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_1]\n"
71#define _UK_SLD_A2_Y "ds_read_b128 v[104:107], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_2]\n"
72#define _UK_SLD_A3_Y "ds_read_b128 v[108:111], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_3]\n"
73#define _UK_SLD_A4_Y "ds_read_b128 v[112:115], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_4]\n"
74#define _UK_SLD_A5_Y "ds_read_b128 v[116:119], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_5]\n"
75#define _UK_SLD_A6_Y "ds_read_b128 v[120:123], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_6]\n"
76#define _UK_SLD_A7_Y "ds_read_b128 v[124:127], %[v_os_slda], offset:1*%[smem_sz] + %[sld_os_7]\n"
// _UK_PIPELINE_0: one compute step of the main loop, expanded into a single concatenated
// assembly string. Every argument is itself a string literal that is pasted into the text.
//
// Parameters (as named in the list below):
//   gld_a0_..gld_a7_   instruction snippets (presumably the A global->LDS loads; TODO confirm)
//   sld_a0_..sld_a7_   instruction snippets (presumably the A LDS reads; TODO confirm)
//   src_a0_..src_a15_  register operands used as the third operand of the MFMA lines
//   fence_0_..fence_3_ instruction snippets (the call sites pass s_waitcnt / s_barrier text)
//   acc_0_..acc_15_    accumulator operands, one per group of eight MFMAs
//
// Layout of the body, 16 groups of 8 MFMA lines (one group per acc_k_):
//   - each MFMA line is `_UK_MFMA_ acc_k_, acc[i:i+1], src_a*_, acc_k_`, i.e. acc_k_ is both the
//     destination and the accumulate input;
//   - groups 2j and 2j+1 read the same register pairs acc[16j : 16j+15]; the even group combines
//     them with src_a0_..src_a7_, the odd group with src_a8_..src_a15_;
//   - one `buffer_load_dwordx4` sits in the middle of every run of four MFMAs (32 loads in all).
//     Together they fill acc[128:255]: offset register v_os_bN (N = 0..7) covers
//     acc[128+16N : 143+16N] at immediate offsets 0 / 1024 / 2048 / 3072, through the resource
//     named by s_base_b_.
// In short: this variant multiplies out of acc[0:127] while loading acc[128:255];
// _UK_PIPELINE_1 is the mirror image.
//
// NOTE(review): in the text as it appears here the gld_*_, sld_*_ and fence_*_ parameters are not
// referenced by any body line, and s_base_b_ is used without appearing in the parameter list —
// verify the parameter list and body against the upstream file.
78#define _UK_PIPELINE_0(gld_a0_, gld_a1_, gld_a2_, gld_a3_, gld_a4_, gld_a5_, gld_a6_, gld_a7_, \
79 sld_a0_, sld_a1_, sld_a2_, sld_a3_, sld_a4_, sld_a5_, sld_a6_, sld_a7_, \
80 src_a0_, src_a1_, src_a2_, src_a3_, src_a4_, src_a5_, src_a6_, src_a7_, \
81 src_a8_, src_a9_, src_a10_, src_a11_, src_a12_, src_a13_, src_a14_, src_a15_, \
82 fence_0_, fence_1_, fence_2_, fence_3_, \
83 acc_0_, acc_1_, acc_2_, acc_3_, acc_4_, acc_5_, acc_6_, acc_7_, \
84 acc_8_, acc_9_, acc_10_, acc_11_, acc_12_, acc_13_, acc_14_, acc_15_, \
87 _UK_MFMA_ " " acc_0_ ", acc[0:1], " src_a0_ ", " acc_0_ "\n" \
88 _UK_MFMA_ " " acc_0_ ", acc[2:3], " src_a1_ ", " acc_0_ "\n" \
89 "buffer_load_dwordx4 acc[128:131], %[v_os_b0], " s_base_b_ ", 0 offen \n" \
90 _UK_MFMA_ " " acc_0_ ", acc[4:5], " src_a2_ ", " acc_0_ "\n" \
91 _UK_MFMA_ " " acc_0_ ", acc[6:7], " src_a3_ ", " acc_0_ "\n" \
93 _UK_MFMA_ " " acc_0_ ", acc[8:9], " src_a4_ ", " acc_0_ "\n" \
94 _UK_MFMA_ " " acc_0_ ", acc[10:11], " src_a5_ ", " acc_0_ "\n" \
95 "buffer_load_dwordx4 acc[132:135], %[v_os_b0], " s_base_b_ ", 0 offen offset:1024 \n" \
96 _UK_MFMA_ " " acc_0_ ", acc[12:13], " src_a6_ ", " acc_0_ "\n" \
97 _UK_MFMA_ " " acc_0_ ", acc[14:15], " src_a7_ ", " acc_0_ "\n" \
99 _UK_MFMA_ " " acc_1_ ", acc[0:1], " src_a8_ ", " acc_1_ "\n" \
100 _UK_MFMA_ " " acc_1_ ", acc[2:3], " src_a9_ ", " acc_1_ "\n" \
101 "buffer_load_dwordx4 acc[136:139], %[v_os_b0], " s_base_b_ ", 0 offen offset:2048 \n" \
102 _UK_MFMA_ " " acc_1_ ", acc[4:5], " src_a10_ ", " acc_1_ "\n" \
103 _UK_MFMA_ " " acc_1_ ", acc[6:7], " src_a11_ ", " acc_1_ "\n" \
105 _UK_MFMA_ " " acc_1_ ", acc[8:9], " src_a12_ ", " acc_1_ "\n" \
106 _UK_MFMA_ " " acc_1_ ", acc[10:11], " src_a13_ ", " acc_1_ "\n" \
107 "buffer_load_dwordx4 acc[140:143], %[v_os_b0], " s_base_b_ ", 0 offen offset:3072 \n" \
108 _UK_MFMA_ " " acc_1_ ", acc[12:13], " src_a14_ ", " acc_1_ "\n" \
109 _UK_MFMA_ " " acc_1_ ", acc[14:15], " src_a15_ ", " acc_1_ "\n" \
111 _UK_MFMA_ " " acc_2_ ", acc[16:17], " src_a0_ ", " acc_2_ "\n" \
112 _UK_MFMA_ " " acc_2_ ", acc[18:19], " src_a1_ ", " acc_2_ "\n" \
113 "buffer_load_dwordx4 acc[144:147], %[v_os_b1], " s_base_b_ ", 0 offen \n" \
114 _UK_MFMA_ " " acc_2_ ", acc[20:21], " src_a2_ ", " acc_2_ "\n" \
115 _UK_MFMA_ " " acc_2_ ", acc[22:23], " src_a3_ ", " acc_2_ "\n" \
117 _UK_MFMA_ " " acc_2_ ", acc[24:25], " src_a4_ ", " acc_2_ "\n" \
118 _UK_MFMA_ " " acc_2_ ", acc[26:27], " src_a5_ ", " acc_2_ "\n" \
119 "buffer_load_dwordx4 acc[148:151], %[v_os_b1], " s_base_b_ ", 0 offen offset:1024 \n" \
120 _UK_MFMA_ " " acc_2_ ", acc[28:29], " src_a6_ ", " acc_2_ "\n" \
121 _UK_MFMA_ " " acc_2_ ", acc[30:31], " src_a7_ ", " acc_2_ "\n" \
123 _UK_MFMA_ " " acc_3_ ", acc[16:17], " src_a8_ ", " acc_3_ "\n" \
124 _UK_MFMA_ " " acc_3_ ", acc[18:19], " src_a9_ ", " acc_3_ "\n" \
125 "buffer_load_dwordx4 acc[152:155], %[v_os_b1], " s_base_b_ ", 0 offen offset:2048 \n" \
126 _UK_MFMA_ " " acc_3_ ", acc[20:21], " src_a10_ ", " acc_3_ "\n" \
127 _UK_MFMA_ " " acc_3_ ", acc[22:23], " src_a11_ ", " acc_3_ "\n" \
129 _UK_MFMA_ " " acc_3_ ", acc[24:25], " src_a12_ ", " acc_3_ "\n" \
130 _UK_MFMA_ " " acc_3_ ", acc[26:27], " src_a13_ ", " acc_3_ "\n" \
131 "buffer_load_dwordx4 acc[156:159], %[v_os_b1], " s_base_b_ ", 0 offen offset:3072 \n" \
132 _UK_MFMA_ " " acc_3_ ", acc[28:29], " src_a14_ ", " acc_3_ "\n" \
133 _UK_MFMA_ " " acc_3_ ", acc[30:31], " src_a15_ ", " acc_3_ "\n" \
136 _UK_MFMA_ " " acc_4_ ", acc[32:33], " src_a0_ ", " acc_4_ "\n" \
137 _UK_MFMA_ " " acc_4_ ", acc[34:35], " src_a1_ ", " acc_4_ "\n" \
138 "buffer_load_dwordx4 acc[160:163], %[v_os_b2], " s_base_b_ ", 0 offen \n" \
139 _UK_MFMA_ " " acc_4_ ", acc[36:37], " src_a2_ ", " acc_4_ "\n" \
140 _UK_MFMA_ " " acc_4_ ", acc[38:39], " src_a3_ ", " acc_4_ "\n" \
142 _UK_MFMA_ " " acc_4_ ", acc[40:41], " src_a4_ ", " acc_4_ "\n" \
143 _UK_MFMA_ " " acc_4_ ", acc[42:43], " src_a5_ ", " acc_4_ "\n" \
144 "buffer_load_dwordx4 acc[164:167], %[v_os_b2], " s_base_b_ ", 0 offen offset:1024 \n" \
145 _UK_MFMA_ " " acc_4_ ", acc[44:45], " src_a6_ ", " acc_4_ "\n" \
146 _UK_MFMA_ " " acc_4_ ", acc[46:47], " src_a7_ ", " acc_4_ "\n" \
148 _UK_MFMA_ " " acc_5_ ", acc[32:33], " src_a8_ ", " acc_5_ "\n" \
149 _UK_MFMA_ " " acc_5_ ", acc[34:35], " src_a9_ ", " acc_5_ "\n" \
150 "buffer_load_dwordx4 acc[168:171], %[v_os_b2], " s_base_b_ ", 0 offen offset:2048 \n" \
151 _UK_MFMA_ " " acc_5_ ", acc[36:37], " src_a10_ ", " acc_5_ "\n" \
152 _UK_MFMA_ " " acc_5_ ", acc[38:39], " src_a11_ ", " acc_5_ "\n" \
154 _UK_MFMA_ " " acc_5_ ", acc[40:41], " src_a12_ ", " acc_5_ "\n" \
155 _UK_MFMA_ " " acc_5_ ", acc[42:43], " src_a13_ ", " acc_5_ "\n" \
156 "buffer_load_dwordx4 acc[172:175], %[v_os_b2], " s_base_b_ ", 0 offen offset:3072 \n" \
157 _UK_MFMA_ " " acc_5_ ", acc[44:45], " src_a14_ ", " acc_5_ "\n" \
158 _UK_MFMA_ " " acc_5_ ", acc[46:47], " src_a15_ ", " acc_5_ "\n" \
160 _UK_MFMA_ " " acc_6_ ", acc[48:49], " src_a0_ ", " acc_6_ "\n" \
161 _UK_MFMA_ " " acc_6_ ", acc[50:51], " src_a1_ ", " acc_6_ "\n" \
162 "buffer_load_dwordx4 acc[176:179], %[v_os_b3], " s_base_b_ ", 0 offen \n" \
163 _UK_MFMA_ " " acc_6_ ", acc[52:53], " src_a2_ ", " acc_6_ "\n" \
164 _UK_MFMA_ " " acc_6_ ", acc[54:55], " src_a3_ ", " acc_6_ "\n" \
166 _UK_MFMA_ " " acc_6_ ", acc[56:57], " src_a4_ ", " acc_6_ "\n" \
167 _UK_MFMA_ " " acc_6_ ", acc[58:59], " src_a5_ ", " acc_6_ "\n" \
168 "buffer_load_dwordx4 acc[180:183], %[v_os_b3], " s_base_b_ ", 0 offen offset:1024 \n" \
169 _UK_MFMA_ " " acc_6_ ", acc[60:61], " src_a6_ ", " acc_6_ "\n" \
170 _UK_MFMA_ " " acc_6_ ", acc[62:63], " src_a7_ ", " acc_6_ "\n" \
172 _UK_MFMA_ " " acc_7_ ", acc[48:49], " src_a8_ ", " acc_7_ "\n" \
173 _UK_MFMA_ " " acc_7_ ", acc[50:51], " src_a9_ ", " acc_7_ "\n" \
174 "buffer_load_dwordx4 acc[184:187], %[v_os_b3], " s_base_b_ ", 0 offen offset:2048 \n" \
175 _UK_MFMA_ " " acc_7_ ", acc[52:53], " src_a10_ ", " acc_7_ "\n" \
176 _UK_MFMA_ " " acc_7_ ", acc[54:55], " src_a11_ ", " acc_7_ "\n" \
178 _UK_MFMA_ " " acc_7_ ", acc[56:57], " src_a12_ ", " acc_7_ "\n" \
179 _UK_MFMA_ " " acc_7_ ", acc[58:59], " src_a13_ ", " acc_7_ "\n" \
180 "buffer_load_dwordx4 acc[188:191], %[v_os_b3], " s_base_b_ ", 0 offen offset:3072 \n" \
181 _UK_MFMA_ " " acc_7_ ", acc[60:61], " src_a14_ ", " acc_7_ "\n" \
182 _UK_MFMA_ " " acc_7_ ", acc[62:63], " src_a15_ ", " acc_7_ "\n" \
185 _UK_MFMA_ " " acc_8_ ", acc[64:65], " src_a0_ ", " acc_8_ "\n" \
186 _UK_MFMA_ " " acc_8_ ", acc[66:67], " src_a1_ ", " acc_8_ "\n" \
187 "buffer_load_dwordx4 acc[192:195], %[v_os_b4], " s_base_b_ ", 0 offen \n" \
188 _UK_MFMA_ " " acc_8_ ", acc[68:69], " src_a2_ ", " acc_8_ "\n" \
189 _UK_MFMA_ " " acc_8_ ", acc[70:71], " src_a3_ ", " acc_8_ "\n" \
190 _UK_MFMA_ " " acc_8_ ", acc[72:73], " src_a4_ ", " acc_8_ "\n" \
191 _UK_MFMA_ " " acc_8_ ", acc[74:75], " src_a5_ ", " acc_8_ "\n" \
192 "buffer_load_dwordx4 acc[196:199], %[v_os_b4], " s_base_b_ ", 0 offen offset:1024 \n" \
193 _UK_MFMA_ " " acc_8_ ", acc[76:77], " src_a6_ ", " acc_8_ "\n" \
194 _UK_MFMA_ " " acc_8_ ", acc[78:79], " src_a7_ ", " acc_8_ "\n" \
195 _UK_MFMA_ " " acc_9_ ", acc[64:65], " src_a8_ ", " acc_9_ "\n" \
196 _UK_MFMA_ " " acc_9_ ", acc[66:67], " src_a9_ ", " acc_9_ "\n" \
197 "buffer_load_dwordx4 acc[200:203], %[v_os_b4], " s_base_b_ ", 0 offen offset:2048 \n" \
198 _UK_MFMA_ " " acc_9_ ", acc[68:69], " src_a10_ ", " acc_9_ "\n" \
199 _UK_MFMA_ " " acc_9_ ", acc[70:71], " src_a11_ ", " acc_9_ "\n" \
200 _UK_MFMA_ " " acc_9_ ", acc[72:73], " src_a12_ ", " acc_9_ "\n" \
201 _UK_MFMA_ " " acc_9_ ", acc[74:75], " src_a13_ ", " acc_9_ "\n" \
202 "buffer_load_dwordx4 acc[204:207], %[v_os_b4], " s_base_b_ ", 0 offen offset:3072 \n" \
203 _UK_MFMA_ " " acc_9_ ", acc[76:77], " src_a14_ ", " acc_9_ "\n" \
204 _UK_MFMA_ " " acc_9_ ", acc[78:79], " src_a15_ ", " acc_9_ "\n" \
205 _UK_MFMA_ " " acc_10_ ", acc[80:81], " src_a0_ ", " acc_10_ "\n" \
206 _UK_MFMA_ " " acc_10_ ", acc[82:83], " src_a1_ ", " acc_10_ "\n" \
207 "buffer_load_dwordx4 acc[208:211], %[v_os_b5], " s_base_b_ ", 0 offen \n" \
208 _UK_MFMA_ " " acc_10_ ", acc[84:85], " src_a2_ ", " acc_10_ "\n" \
209 _UK_MFMA_ " " acc_10_ ", acc[86:87], " src_a3_ ", " acc_10_ "\n" \
210 _UK_MFMA_ " " acc_10_ ", acc[88:89], " src_a4_ ", " acc_10_ "\n" \
211 _UK_MFMA_ " " acc_10_ ", acc[90:91], " src_a5_ ", " acc_10_ "\n" \
212 "buffer_load_dwordx4 acc[212:215], %[v_os_b5], " s_base_b_ ", 0 offen offset:1024 \n" \
213 _UK_MFMA_ " " acc_10_ ", acc[92:93], " src_a6_ ", " acc_10_ "\n" \
214 _UK_MFMA_ " " acc_10_ ", acc[94:95], " src_a7_ ", " acc_10_ "\n" \
215 _UK_MFMA_ " " acc_11_ ", acc[80:81], " src_a8_ ", " acc_11_ "\n" \
216 _UK_MFMA_ " " acc_11_ ", acc[82:83], " src_a9_ ", " acc_11_ "\n" \
217 "buffer_load_dwordx4 acc[216:219], %[v_os_b5], " s_base_b_ ", 0 offen offset:2048 \n" \
218 _UK_MFMA_ " " acc_11_ ", acc[84:85], " src_a10_ ", " acc_11_ "\n" \
219 _UK_MFMA_ " " acc_11_ ", acc[86:87], " src_a11_ ", " acc_11_ "\n" \
220 _UK_MFMA_ " " acc_11_ ", acc[88:89], " src_a12_ ", " acc_11_ "\n" \
221 _UK_MFMA_ " " acc_11_ ", acc[90:91], " src_a13_ ", " acc_11_ "\n" \
222 "buffer_load_dwordx4 acc[220:223], %[v_os_b5], " s_base_b_ ", 0 offen offset:3072 \n" \
223 _UK_MFMA_ " " acc_11_ ", acc[92:93], " src_a14_ ", " acc_11_ "\n" \
224 _UK_MFMA_ " " acc_11_ ", acc[94:95], " src_a15_ ", " acc_11_ "\n" \
226 _UK_MFMA_ " " acc_12_ ", acc[96:97], " src_a0_ ", " acc_12_ "\n" \
227 _UK_MFMA_ " " acc_12_ ", acc[98:99], " src_a1_ ", " acc_12_ "\n" \
228 "buffer_load_dwordx4 acc[224:227], %[v_os_b6], " s_base_b_ ", 0 offen \n" \
229 _UK_MFMA_ " " acc_12_ ", acc[100:101], " src_a2_ ", " acc_12_ "\n" \
230 _UK_MFMA_ " " acc_12_ ", acc[102:103], " src_a3_ ", " acc_12_ "\n" \
231 _UK_MFMA_ " " acc_12_ ", acc[104:105], " src_a4_ ", " acc_12_ "\n" \
232 _UK_MFMA_ " " acc_12_ ", acc[106:107], " src_a5_ ", " acc_12_ "\n" \
233 "buffer_load_dwordx4 acc[228:231], %[v_os_b6], " s_base_b_ ", 0 offen offset:1024 \n" \
234 _UK_MFMA_ " " acc_12_ ", acc[108:109], " src_a6_ ", " acc_12_ "\n" \
235 _UK_MFMA_ " " acc_12_ ", acc[110:111], " src_a7_ ", " acc_12_ "\n" \
236 _UK_MFMA_ " " acc_13_ ", acc[96:97], " src_a8_ ", " acc_13_ "\n" \
237 _UK_MFMA_ " " acc_13_ ", acc[98:99], " src_a9_ ", " acc_13_ "\n" \
238 "buffer_load_dwordx4 acc[232:235], %[v_os_b6], " s_base_b_ ", 0 offen offset:2048 \n" \
239 _UK_MFMA_ " " acc_13_ ", acc[100:101], " src_a10_ ", " acc_13_ "\n" \
240 _UK_MFMA_ " " acc_13_ ", acc[102:103], " src_a11_ ", " acc_13_ "\n" \
241 _UK_MFMA_ " " acc_13_ ", acc[104:105], " src_a12_ ", " acc_13_ "\n" \
242 _UK_MFMA_ " " acc_13_ ", acc[106:107], " src_a13_ ", " acc_13_ "\n" \
243 "buffer_load_dwordx4 acc[236:239], %[v_os_b6], " s_base_b_ ", 0 offen offset:3072 \n" \
244 _UK_MFMA_ " " acc_13_ ", acc[108:109], " src_a14_ ", " acc_13_ "\n" \
245 _UK_MFMA_ " " acc_13_ ", acc[110:111], " src_a15_ ", " acc_13_ "\n" \
246 _UK_MFMA_ " " acc_14_ ", acc[112:113], " src_a0_ ", " acc_14_ "\n" \
247 _UK_MFMA_ " " acc_14_ ", acc[114:115], " src_a1_ ", " acc_14_ "\n" \
248 "buffer_load_dwordx4 acc[240:243], %[v_os_b7], " s_base_b_ ", 0 offen \n" \
249 _UK_MFMA_ " " acc_14_ ", acc[116:117], " src_a2_ ", " acc_14_ "\n" \
250 _UK_MFMA_ " " acc_14_ ", acc[118:119], " src_a3_ ", " acc_14_ "\n" \
251 _UK_MFMA_ " " acc_14_ ", acc[120:121], " src_a4_ ", " acc_14_ "\n" \
252 _UK_MFMA_ " " acc_14_ ", acc[122:123], " src_a5_ ", " acc_14_ "\n" \
253 "buffer_load_dwordx4 acc[244:247], %[v_os_b7], " s_base_b_ ", 0 offen offset:1024 \n" \
254 _UK_MFMA_ " " acc_14_ ", acc[124:125], " src_a6_ ", " acc_14_ "\n" \
255 _UK_MFMA_ " " acc_14_ ", acc[126:127], " src_a7_ ", " acc_14_ "\n" \
256 _UK_MFMA_ " " acc_15_ ", acc[112:113], " src_a8_ ", " acc_15_ "\n" \
257 _UK_MFMA_ " " acc_15_ ", acc[114:115], " src_a9_ ", " acc_15_ "\n" \
258 "buffer_load_dwordx4 acc[248:251], %[v_os_b7], " s_base_b_ ", 0 offen offset:2048 \n" \
259 _UK_MFMA_ " " acc_15_ ", acc[116:117], " src_a10_ ", " acc_15_ "\n" \
260 _UK_MFMA_ " " acc_15_ ", acc[118:119], " src_a11_ ", " acc_15_ "\n" \
261 _UK_MFMA_ " " acc_15_ ", acc[120:121], " src_a12_ ", " acc_15_ "\n" \
262 _UK_MFMA_ " " acc_15_ ", acc[122:123], " src_a13_ ", " acc_15_ "\n" \
263 "buffer_load_dwordx4 acc[252:255], %[v_os_b7], " s_base_b_ ", 0 offen offset:3072\n" \
264 _UK_MFMA_ " " acc_15_ ", acc[124:125], " src_a14_ ", " acc_15_ "\n" \
265 _UK_MFMA_ " " acc_15_ ", acc[126:127], " src_a15_ ", " acc_15_ "\n"
// _UK_PIPELINE_1: the second compute step of the main loop; same shape and parameter list as
// _UK_PIPELINE_0 with the two halves of the acc[] register file swapped.
//
// Layout of the body, 16 groups of 8 MFMA lines (one group per acc_k_):
//   - each MFMA line is `_UK_MFMA_ acc_k_, acc[i:i+1], src_a*_, acc_k_` (acc_k_ is destination
//     and accumulate input);
//   - groups 2j and 2j+1 read the register pairs acc[128+16j : 143+16j]; the even group combines
//     them with src_a0_..src_a7_, the odd group with src_a8_..src_a15_;
//   - one `buffer_load_dwordx4` sits in the middle of every run of four MFMAs (32 loads in all),
//     refilling acc[0:127]: offset register v_os_bN (N = 0..7) covers acc[16N : 16N+15] at
//     immediate offsets 0 / 1024 / 2048 / 3072, through the resource named by s_base_b_.
// In short: this variant multiplies out of acc[128:255] while loading acc[0:127].
//
// NOTE(review): in the text as it appears here the gld_*_, sld_*_ and fence_*_ parameters are not
// referenced by any body line, and s_base_b_ is used without appearing in the parameter list —
// verify the parameter list and body against the upstream file.
267#define _UK_PIPELINE_1(gld_a0_, gld_a1_, gld_a2_, gld_a3_, gld_a4_, gld_a5_, gld_a6_, gld_a7_, \
268 sld_a0_, sld_a1_, sld_a2_, sld_a3_, sld_a4_, sld_a5_, sld_a6_, sld_a7_, \
269 src_a0_, src_a1_, src_a2_, src_a3_, src_a4_, src_a5_, src_a6_, src_a7_, \
270 src_a8_, src_a9_, src_a10_, src_a11_, src_a12_, src_a13_, src_a14_, src_a15_, \
271 fence_0_, fence_1_, fence_2_, fence_3_, \
272 acc_0_, acc_1_, acc_2_, acc_3_, acc_4_, acc_5_, acc_6_, acc_7_, \
273 acc_8_, acc_9_, acc_10_, acc_11_, acc_12_, acc_13_, acc_14_, acc_15_, \
276 _UK_MFMA_ " " acc_0_ ", acc[128:129], " src_a0_ ", " acc_0_ "\n" \
277 _UK_MFMA_ " " acc_0_ ", acc[130:131], " src_a1_ ", " acc_0_ "\n" \
278 "buffer_load_dwordx4 acc[0:3], %[v_os_b0], " s_base_b_ ", 0 offen \n" \
279 _UK_MFMA_ " " acc_0_ ", acc[132:133], " src_a2_ ", " acc_0_ "\n" \
280 _UK_MFMA_ " " acc_0_ ", acc[134:135], " src_a3_ ", " acc_0_ "\n" \
282 _UK_MFMA_ " " acc_0_ ", acc[136:137], " src_a4_ ", " acc_0_ "\n" \
283 _UK_MFMA_ " " acc_0_ ", acc[138:139], " src_a5_ ", " acc_0_ "\n" \
284 "buffer_load_dwordx4 acc[4:7], %[v_os_b0], " s_base_b_ ", 0 offen offset:1024 \n" \
285 _UK_MFMA_ " " acc_0_ ", acc[140:141], " src_a6_ ", " acc_0_ "\n" \
286 _UK_MFMA_ " " acc_0_ ", acc[142:143], " src_a7_ ", " acc_0_ "\n" \
288 _UK_MFMA_ " " acc_1_ ", acc[128:129], " src_a8_ ", " acc_1_ "\n" \
289 _UK_MFMA_ " " acc_1_ ", acc[130:131], " src_a9_ ", " acc_1_ "\n" \
290 "buffer_load_dwordx4 acc[8:11], %[v_os_b0], " s_base_b_ ", 0 offen offset:2048 \n" \
291 _UK_MFMA_ " " acc_1_ ", acc[132:133], " src_a10_ ", " acc_1_ "\n" \
292 _UK_MFMA_ " " acc_1_ ", acc[134:135], " src_a11_ ", " acc_1_ "\n" \
294 _UK_MFMA_ " " acc_1_ ", acc[136:137], " src_a12_ ", " acc_1_ "\n" \
295 _UK_MFMA_ " " acc_1_ ", acc[138:139], " src_a13_ ", " acc_1_ "\n" \
296 "buffer_load_dwordx4 acc[12:15], %[v_os_b0], " s_base_b_ ", 0 offen offset:3072 \n" \
297 _UK_MFMA_ " " acc_1_ ", acc[140:141], " src_a14_ ", " acc_1_ "\n" \
298 _UK_MFMA_ " " acc_1_ ", acc[142:143], " src_a15_ ", " acc_1_ "\n" \
300 _UK_MFMA_ " " acc_2_ ", acc[144:145], " src_a0_ ", " acc_2_ "\n" \
301 _UK_MFMA_ " " acc_2_ ", acc[146:147], " src_a1_ ", " acc_2_ "\n" \
302 "buffer_load_dwordx4 acc[16:19], %[v_os_b1], " s_base_b_ ", 0 offen \n" \
303 _UK_MFMA_ " " acc_2_ ", acc[148:149], " src_a2_ ", " acc_2_ "\n" \
304 _UK_MFMA_ " " acc_2_ ", acc[150:151], " src_a3_ ", " acc_2_ "\n" \
306 _UK_MFMA_ " " acc_2_ ", acc[152:153], " src_a4_ ", " acc_2_ "\n" \
307 _UK_MFMA_ " " acc_2_ ", acc[154:155], " src_a5_ ", " acc_2_ "\n" \
308 "buffer_load_dwordx4 acc[20:23], %[v_os_b1], " s_base_b_ ", 0 offen offset:1024 \n" \
309 _UK_MFMA_ " " acc_2_ ", acc[156:157], " src_a6_ ", " acc_2_ "\n" \
310 _UK_MFMA_ " " acc_2_ ", acc[158:159], " src_a7_ ", " acc_2_ "\n" \
312 _UK_MFMA_ " " acc_3_ ", acc[144:145], " src_a8_ ", " acc_3_ "\n" \
313 _UK_MFMA_ " " acc_3_ ", acc[146:147], " src_a9_ ", " acc_3_ "\n" \
314 "buffer_load_dwordx4 acc[24:27], %[v_os_b1], " s_base_b_ ", 0 offen offset:2048 \n" \
315 _UK_MFMA_ " " acc_3_ ", acc[148:149], " src_a10_ ", " acc_3_ "\n" \
316 _UK_MFMA_ " " acc_3_ ", acc[150:151], " src_a11_ ", " acc_3_ "\n" \
318 _UK_MFMA_ " " acc_3_ ", acc[152:153], " src_a12_ ", " acc_3_ "\n" \
319 _UK_MFMA_ " " acc_3_ ", acc[154:155], " src_a13_ ", " acc_3_ "\n" \
320 "buffer_load_dwordx4 acc[28:31], %[v_os_b1], " s_base_b_ ", 0 offen offset:3072 \n" \
321 _UK_MFMA_ " " acc_3_ ", acc[156:157], " src_a14_ ", " acc_3_ "\n" \
322 _UK_MFMA_ " " acc_3_ ", acc[158:159], " src_a15_ ", " acc_3_ "\n" \
325 _UK_MFMA_ " " acc_4_ ", acc[160:161], " src_a0_ ", " acc_4_ "\n" \
326 _UK_MFMA_ " " acc_4_ ", acc[162:163], " src_a1_ ", " acc_4_ "\n" \
327 "buffer_load_dwordx4 acc[32:35], %[v_os_b2], " s_base_b_ ", 0 offen \n" \
328 _UK_MFMA_ " " acc_4_ ", acc[164:165], " src_a2_ ", " acc_4_ "\n" \
329 _UK_MFMA_ " " acc_4_ ", acc[166:167], " src_a3_ ", " acc_4_ "\n" \
331 _UK_MFMA_ " " acc_4_ ", acc[168:169], " src_a4_ ", " acc_4_ "\n" \
332 _UK_MFMA_ " " acc_4_ ", acc[170:171], " src_a5_ ", " acc_4_ "\n" \
333 "buffer_load_dwordx4 acc[36:39], %[v_os_b2], " s_base_b_ ", 0 offen offset:1024 \n" \
334 _UK_MFMA_ " " acc_4_ ", acc[172:173], " src_a6_ ", " acc_4_ "\n" \
335 _UK_MFMA_ " " acc_4_ ", acc[174:175], " src_a7_ ", " acc_4_ "\n" \
337 _UK_MFMA_ " " acc_5_ ", acc[160:161], " src_a8_ ", " acc_5_ "\n" \
338 _UK_MFMA_ " " acc_5_ ", acc[162:163], " src_a9_ ", " acc_5_ "\n" \
339 "buffer_load_dwordx4 acc[40:43], %[v_os_b2], " s_base_b_ ", 0 offen offset:2048 \n" \
340 _UK_MFMA_ " " acc_5_ ", acc[164:165], " src_a10_ ", " acc_5_ "\n" \
341 _UK_MFMA_ " " acc_5_ ", acc[166:167], " src_a11_ ", " acc_5_ "\n" \
343 _UK_MFMA_ " " acc_5_ ", acc[168:169], " src_a12_ ", " acc_5_ "\n" \
344 _UK_MFMA_ " " acc_5_ ", acc[170:171], " src_a13_ ", " acc_5_ "\n" \
345 "buffer_load_dwordx4 acc[44:47], %[v_os_b2], " s_base_b_ ", 0 offen offset:3072 \n" \
346 _UK_MFMA_ " " acc_5_ ", acc[172:173], " src_a14_ ", " acc_5_ "\n" \
347 _UK_MFMA_ " " acc_5_ ", acc[174:175], " src_a15_ ", " acc_5_ "\n" \
349 _UK_MFMA_ " " acc_6_ ", acc[176:177], " src_a0_ ", " acc_6_ "\n" \
350 _UK_MFMA_ " " acc_6_ ", acc[178:179], " src_a1_ ", " acc_6_ "\n" \
351 "buffer_load_dwordx4 acc[48:51], %[v_os_b3], " s_base_b_ ", 0 offen \n" \
352 _UK_MFMA_ " " acc_6_ ", acc[180:181], " src_a2_ ", " acc_6_ "\n" \
353 _UK_MFMA_ " " acc_6_ ", acc[182:183], " src_a3_ ", " acc_6_ "\n" \
355 _UK_MFMA_ " " acc_6_ ", acc[184:185], " src_a4_ ", " acc_6_ "\n" \
356 _UK_MFMA_ " " acc_6_ ", acc[186:187], " src_a5_ ", " acc_6_ "\n" \
357 "buffer_load_dwordx4 acc[52:55], %[v_os_b3], " s_base_b_ ", 0 offen offset:1024 \n" \
358 _UK_MFMA_ " " acc_6_ ", acc[188:189], " src_a6_ ", " acc_6_ "\n" \
359 _UK_MFMA_ " " acc_6_ ", acc[190:191], " src_a7_ ", " acc_6_ "\n" \
361 _UK_MFMA_ " " acc_7_ ", acc[176:177], " src_a8_ ", " acc_7_ "\n" \
362 _UK_MFMA_ " " acc_7_ ", acc[178:179], " src_a9_ ", " acc_7_ "\n" \
363 "buffer_load_dwordx4 acc[56:59], %[v_os_b3], " s_base_b_ ", 0 offen offset:2048 \n" \
364 _UK_MFMA_ " " acc_7_ ", acc[180:181], " src_a10_ ", " acc_7_ "\n" \
365 _UK_MFMA_ " " acc_7_ ", acc[182:183], " src_a11_ ", " acc_7_ "\n" \
367 _UK_MFMA_ " " acc_7_ ", acc[184:185], " src_a12_ ", " acc_7_ "\n" \
368 _UK_MFMA_ " " acc_7_ ", acc[186:187], " src_a13_ ", " acc_7_ "\n" \
369 "buffer_load_dwordx4 acc[60:63], %[v_os_b3], " s_base_b_ ", 0 offen offset:3072 \n" \
370 _UK_MFMA_ " " acc_7_ ", acc[188:189], " src_a14_ ", " acc_7_ "\n" \
371 _UK_MFMA_ " " acc_7_ ", acc[190:191], " src_a15_ ", " acc_7_ "\n" \
374 _UK_MFMA_ " " acc_8_ ", acc[192:193], " src_a0_ ", " acc_8_ "\n" \
375 _UK_MFMA_ " " acc_8_ ", acc[194:195], " src_a1_ ", " acc_8_ "\n" \
376 "buffer_load_dwordx4 acc[64:67], %[v_os_b4], " s_base_b_ ", 0 offen \n" \
377 _UK_MFMA_ " " acc_8_ ", acc[196:197], " src_a2_ ", " acc_8_ "\n" \
378 _UK_MFMA_ " " acc_8_ ", acc[198:199], " src_a3_ ", " acc_8_ "\n" \
379 _UK_MFMA_ " " acc_8_ ", acc[200:201], " src_a4_ ", " acc_8_ "\n" \
380 _UK_MFMA_ " " acc_8_ ", acc[202:203], " src_a5_ ", " acc_8_ "\n" \
381 "buffer_load_dwordx4 acc[68:71], %[v_os_b4], " s_base_b_ ", 0 offen offset:1024 \n" \
382 _UK_MFMA_ " " acc_8_ ", acc[204:205], " src_a6_ ", " acc_8_ "\n" \
383 _UK_MFMA_ " " acc_8_ ", acc[206:207], " src_a7_ ", " acc_8_ "\n" \
384 _UK_MFMA_ " " acc_9_ ", acc[192:193], " src_a8_ ", " acc_9_ "\n" \
385 _UK_MFMA_ " " acc_9_ ", acc[194:195], " src_a9_ ", " acc_9_ "\n" \
386 "buffer_load_dwordx4 acc[72:75], %[v_os_b4], " s_base_b_ ", 0 offen offset:2048 \n" \
387 _UK_MFMA_ " " acc_9_ ", acc[196:197], " src_a10_ ", " acc_9_ "\n" \
388 _UK_MFMA_ " " acc_9_ ", acc[198:199], " src_a11_ ", " acc_9_ "\n" \
389 _UK_MFMA_ " " acc_9_ ", acc[200:201], " src_a12_ ", " acc_9_ "\n" \
390 _UK_MFMA_ " " acc_9_ ", acc[202:203], " src_a13_ ", " acc_9_ "\n" \
391 "buffer_load_dwordx4 acc[76:79], %[v_os_b4], " s_base_b_ ", 0 offen offset:3072 \n" \
392 _UK_MFMA_ " " acc_9_ ", acc[204:205], " src_a14_ ", " acc_9_ "\n" \
393 _UK_MFMA_ " " acc_9_ ", acc[206:207], " src_a15_ ", " acc_9_ "\n" \
394 _UK_MFMA_ " " acc_10_ ", acc[208:209], " src_a0_ ", " acc_10_ "\n" \
395 _UK_MFMA_ " " acc_10_ ", acc[210:211], " src_a1_ ", " acc_10_ "\n" \
396 "buffer_load_dwordx4 acc[80:83], %[v_os_b5], " s_base_b_ ", 0 offen \n" \
397 _UK_MFMA_ " " acc_10_ ", acc[212:213], " src_a2_ ", " acc_10_ "\n" \
398 _UK_MFMA_ " " acc_10_ ", acc[214:215], " src_a3_ ", " acc_10_ "\n" \
399 _UK_MFMA_ " " acc_10_ ", acc[216:217], " src_a4_ ", " acc_10_ "\n" \
400 _UK_MFMA_ " " acc_10_ ", acc[218:219], " src_a5_ ", " acc_10_ "\n" \
401 "buffer_load_dwordx4 acc[84:87], %[v_os_b5], " s_base_b_ ", 0 offen offset:1024 \n" \
402 _UK_MFMA_ " " acc_10_ ", acc[220:221], " src_a6_ ", " acc_10_ "\n" \
403 _UK_MFMA_ " " acc_10_ ", acc[222:223], " src_a7_ ", " acc_10_ "\n" \
404 _UK_MFMA_ " " acc_11_ ", acc[208:209], " src_a8_ ", " acc_11_ "\n" \
405 _UK_MFMA_ " " acc_11_ ", acc[210:211], " src_a9_ ", " acc_11_ "\n" \
406 "buffer_load_dwordx4 acc[88:91], %[v_os_b5], " s_base_b_ ", 0 offen offset:2048 \n" \
407 _UK_MFMA_ " " acc_11_ ", acc[212:213], " src_a10_ ", " acc_11_ "\n" \
408 _UK_MFMA_ " " acc_11_ ", acc[214:215], " src_a11_ ", " acc_11_ "\n" \
409 _UK_MFMA_ " " acc_11_ ", acc[216:217], " src_a12_ ", " acc_11_ "\n" \
410 _UK_MFMA_ " " acc_11_ ", acc[218:219], " src_a13_ ", " acc_11_ "\n" \
411 "buffer_load_dwordx4 acc[92:95], %[v_os_b5], " s_base_b_ ", 0 offen offset:3072 \n" \
412 _UK_MFMA_ " " acc_11_ ", acc[220:221], " src_a14_ ", " acc_11_ "\n" \
413 _UK_MFMA_ " " acc_11_ ", acc[222:223], " src_a15_ ", " acc_11_ "\n" \
415 _UK_MFMA_ " " acc_12_ ", acc[224:225], " src_a0_ ", " acc_12_ "\n" \
416 _UK_MFMA_ " " acc_12_ ", acc[226:227], " src_a1_ ", " acc_12_ "\n" \
417 "buffer_load_dwordx4 acc[96:99], %[v_os_b6], " s_base_b_ ", 0 offen \n" \
418 _UK_MFMA_ " " acc_12_ ", acc[228:229], " src_a2_ ", " acc_12_ "\n" \
419 _UK_MFMA_ " " acc_12_ ", acc[230:231], " src_a3_ ", " acc_12_ "\n" \
420 _UK_MFMA_ " " acc_12_ ", acc[232:233], " src_a4_ ", " acc_12_ "\n" \
421 _UK_MFMA_ " " acc_12_ ", acc[234:235], " src_a5_ ", " acc_12_ "\n" \
422 "buffer_load_dwordx4 acc[100:103], %[v_os_b6], " s_base_b_ ", 0 offen offset:1024 \n" \
423 _UK_MFMA_ " " acc_12_ ", acc[236:237], " src_a6_ ", " acc_12_ "\n" \
424 _UK_MFMA_ " " acc_12_ ", acc[238:239], " src_a7_ ", " acc_12_ "\n" \
425 _UK_MFMA_ " " acc_13_ ", acc[224:225], " src_a8_ ", " acc_13_ "\n" \
426 _UK_MFMA_ " " acc_13_ ", acc[226:227], " src_a9_ ", " acc_13_ "\n" \
427 "buffer_load_dwordx4 acc[104:107], %[v_os_b6], " s_base_b_ ", 0 offen offset:2048 \n" \
428 _UK_MFMA_ " " acc_13_ ", acc[228:229], " src_a10_ ", " acc_13_ "\n" \
429 _UK_MFMA_ " " acc_13_ ", acc[230:231], " src_a11_ ", " acc_13_ "\n" \
430 _UK_MFMA_ " " acc_13_ ", acc[232:233], " src_a12_ ", " acc_13_ "\n" \
431 _UK_MFMA_ " " acc_13_ ", acc[234:235], " src_a13_ ", " acc_13_ "\n" \
432 "buffer_load_dwordx4 acc[108:111], %[v_os_b6], " s_base_b_ ", 0 offen offset:3072 \n" \
433 _UK_MFMA_ " " acc_13_ ", acc[236:237], " src_a14_ ", " acc_13_ "\n" \
434 _UK_MFMA_ " " acc_13_ ", acc[238:239], " src_a15_ ", " acc_13_ "\n" \
435 _UK_MFMA_ " " acc_14_ ", acc[240:241], " src_a0_ ", " acc_14_ "\n" \
436 _UK_MFMA_ " " acc_14_ ", acc[242:243], " src_a1_ ", " acc_14_ "\n" \
437 "buffer_load_dwordx4 acc[112:115], %[v_os_b7], " s_base_b_ ", 0 offen \n" \
438 _UK_MFMA_ " " acc_14_ ", acc[244:245], " src_a2_ ", " acc_14_ "\n" \
439 _UK_MFMA_ " " acc_14_ ", acc[246:247], " src_a3_ ", " acc_14_ "\n" \
440 _UK_MFMA_ " " acc_14_ ", acc[248:249], " src_a4_ ", " acc_14_ "\n" \
441 _UK_MFMA_ " " acc_14_ ", acc[250:251], " src_a5_ ", " acc_14_ "\n" \
442 "buffer_load_dwordx4 acc[116:119], %[v_os_b7], " s_base_b_ ", 0 offen offset:1024 \n" \
443 _UK_MFMA_ " " acc_14_ ", acc[252:253], " src_a6_ ", " acc_14_ "\n" \
444 _UK_MFMA_ " " acc_14_ ", acc[254:255], " src_a7_ ", " acc_14_ "\n" \
445 _UK_MFMA_ " " acc_15_ ", acc[240:241], " src_a8_ ", " acc_15_ "\n" \
446 _UK_MFMA_ " " acc_15_ ", acc[242:243], " src_a9_ ", " acc_15_ "\n" \
447 "buffer_load_dwordx4 acc[120:123], %[v_os_b7], " s_base_b_ ", 0 offen offset:2048 \n" \
448 _UK_MFMA_ " " acc_15_ ", acc[244:245], " src_a10_ ", " acc_15_ "\n" \
449 _UK_MFMA_ " " acc_15_ ", acc[246:247], " src_a11_ ", " acc_15_ "\n" \
450 _UK_MFMA_ " " acc_15_ ", acc[248:249], " src_a12_ ", " acc_15_ "\n" \
451 _UK_MFMA_ " " acc_15_ ", acc[250:251], " src_a13_ ", " acc_15_ "\n" \
452 "buffer_load_dwordx4 acc[124:127], %[v_os_b7], " s_base_b_ ", 0 offen offset:3072 \n" \
453 _UK_MFMA_ " " acc_15_ ", acc[252:253], " src_a14_ ", " acc_15_ "\n" \
454 _UK_MFMA_ " " acc_15_ ", acc[254:255], " src_a15_ ", " acc_15_ "\n"
// Instruction sequence emitted when CK_TILE_FLATMM_UK_2B == 0: prologue followed by the
// two-step main loop (labels L_start_uk_... / L_end_uk_...).
457#if CK_TILE_FLATMM_UK_2B == 0
// Copy the four A resource words into s[16:19] and the four B resource words into s[20:23];
// these are the fixed register ranges the load strings in this file name explicitly.
458 "s_mov_b32 s16, %[s_res_a0]\n"
459 "s_mov_b32 s17, %[s_res_a1]\n"
460 "s_mov_b32 s18, %[s_res_a2]\n"
461 "s_mov_b32 s19, %[s_res_a3]\n"
462 "s_mov_b32 s20, %[s_res_b0]\n"
463 "s_mov_b32 s21, %[s_res_b1]\n"
464 "s_mov_b32 s22, %[s_res_b2]\n"
465 "s_mov_b32 s23, %[s_res_b3]\n"
// m0 = 0 + s_m0_init.
467 "s_add_u32 m0, 0, %[s_m0_init]\n"
// Conditional advance of A: s86 = (s_loop_cnt > 1) ? s_tile_os_a : 0, then a 64-bit add of s86
// into s[16:17] (s_add_u32 + carry via s_addc_u32). Nothing is added when s_loop_cnt <= 1.
476 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond\n"
477 "s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond\n"
478 "s_add_u32 s16, s86, s16 ; move a with cond\n"
479 "s_addc_u32 s17, 0, s17 ; move a with cond\n"
// Second conditional advance of A, this time only when s_loop_cnt > 2.
489 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond\n"
490 "s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond\n"
491 "s_add_u32 s16, s86, s16 ; move a with cond\n"
492 "s_addc_u32 s17, 0, s17 ; move a with cond\n"
// Initial B load: 32 x buffer_load_dwordx4 through s[20:23] fill acc[0:127], four loads per
// offset register v_os_b0..v_os_b7 at immediate offsets 0 / 1024 / 2048 / 3072. This is the
// register half that _UK_PIPELINE_0 reads.
494 "buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
495 "buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
496 "buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
497 "buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
498 "buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n"
499 "buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
500 "buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
501 "buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
502 "buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n"
503 "buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
504 "buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
505 "buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
506 "buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n"
507 "buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
508 "buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
509 "buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
510 "buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n"
511 "buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
512 "buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
513 "buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
514 "buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n"
515 "buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
516 "buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
517 "buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
518 "buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n"
519 "buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
520 "buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
521 "buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
522 "buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n"
523 "buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
524 "buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
525 "buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n"
// Conditional advance of B: add s_tile_os_b to the 64-bit value in s[20:21] only when
// s_loop_cnt > 1 (s86 is 0 otherwise).
526 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond\n"
527 "s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond\n"
528 "s_add_u32 s20, s86, s20 ; move b with cond\n"
529 "s_addc_u32 s21, 0, s21 ; move b with cond\n"
// Wait until at most 40 vector-memory operations are still outstanding.
530 "s_waitcnt vmcnt(40)\n"
// Top of the main loop. `%=` makes the label unique per asm instance.
540 "L_start_uk_g1u0_32x512x128_16x16x16_%=:\n"
// First loop step — NOTE(review): presumably the src_a / fence / acc arguments of a
// _UK_PIPELINE_0(...) invocation; confirm against the call line.
// src_a0..src_a15: VGPR pairs covering v[64:95], the destination range of the _UK_SLD_A*_X reads.
543 "v[64:65]",
"v[66:67]",
"v[68:69]",
"v[70:71]",
"v[72:73]",
"v[74:75]",
"v[76:77]",
"v[78:79]",
544 "v[80:81]",
"v[82:83]",
"v[84:85]",
"v[86:87]",
"v[88:89]",
"v[90:91]",
"v[92:93]",
"v[94:95]",
// fence_0..fence_3: the first waits for vmcnt(24) and lgkmcnt(0) and then executes s_barrier;
// the other three only wait for vmcnt(32).
545 "s_waitcnt vmcnt(24) & lgkmcnt(0)\ns_barrier\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
// acc_0..acc_15: the sixteen accumulator operands supplied by the enclosing asm statement.
546 "%[v_acc_0]",
"%[v_acc_1]",
"%[v_acc_2]",
"%[v_acc_3]",
"%[v_acc_4]",
"%[v_acc_5]",
"%[v_acc_6]",
"%[v_acc_7]",
547 "%[v_acc_8]",
"%[v_acc_9]",
"%[v_acc_10]",
"%[v_acc_11]",
"%[v_acc_12]",
"%[v_acc_13]",
"%[v_acc_14]",
"%[v_acc_15]",
// Loop bookkeeping after the first step: decrement s_loop_cnt and leave the loop when it is no
// longer > 0; otherwise advance A (only if the new count > 2) and B (only if > 1) as above.
549 "s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n"
550 "s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
551 "s_cbranch_scc0 L_end_uk_g1u0_32x512x128_16x16x16_%=\n"
552 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n"
553 "s_cselect_b32 s86, %[s_tile_os_a], 0 \n"
554 "s_add_u32 s16, s86, s16 \n"
555 "s_addc_u32 s17, 0, s17 \n"
556 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
557 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
558 "s_add_u32 s20, s86, s20 \n"
559 "s_addc_u32 s21, 0, s21 \n"
560 ";------------------------------------------ \n"
// Second loop step — NOTE(review): presumably the arguments of a _UK_PIPELINE_1(...) invocation;
// confirm against the call line.
// src_a0..src_a15: VGPR pairs covering v[96:127], the destination range of the _UK_SLD_A*_Y reads.
563 "v[96:97]",
"v[98:99]",
"v[100:101]",
"v[102:103]",
"v[104:105]",
"v[106:107]",
"v[108:109]",
"v[110:111]",
564 "v[112:113]",
"v[114:115]",
"v[116:117]",
"v[118:119]",
"v[120:121]",
"v[122:123]",
"v[124:125]",
"v[126:127]",
// Same four fence strings as in the first step.
565 "s_waitcnt vmcnt(24) & lgkmcnt(0)\ns_barrier\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
// Same sixteen accumulator operands as in the first step.
566 "%[v_acc_0]",
"%[v_acc_1]",
"%[v_acc_2]",
"%[v_acc_3]",
"%[v_acc_4]",
"%[v_acc_5]",
"%[v_acc_6]",
"%[v_acc_7]",
567 "%[v_acc_8]",
"%[v_acc_9]",
"%[v_acc_10]",
"%[v_acc_11]",
"%[v_acc_12]",
"%[v_acc_13]",
"%[v_acc_14]",
"%[v_acc_15]",
// Loop bookkeeping after the second step: same decrement / exit test / conditional A and B
// advance, then an unconditional branch back to the loop head.
569 "s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n"
570 "s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
571 "s_cbranch_scc0 L_end_uk_g1u0_32x512x128_16x16x16_%=\n"
572 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n"
573 "s_cselect_b32 s86, %[s_tile_os_a], 0 \n"
574 "s_add_u32 s16, s86, s16 \n"
575 "s_addc_u32 s17, 0, s17 \n"
576 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
577 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
578 "s_add_u32 s20, s86, s20 \n"
579 "s_addc_u32 s21, 0, s21 \n"
580 "s_branch L_start_uk_g1u0_32x512x128_16x16x16_%=\n"
// Loop exit target for both s_cbranch_scc0 instructions above.
581 "L_end_uk_g1u0_32x512x128_16x16x16_%=:\n"
// g1u1 prologue.
// Copy the A resource operands into s[16:19] and the eight B resource operands into
// s[20:23] (s_res_b0..b3) and s[24:27] (s_res_b4..b7).
584 "s_mov_b32 s16, %[s_res_a0]\n"
585 "s_mov_b32 s17, %[s_res_a1]\n"
586 "s_mov_b32 s18, %[s_res_a2]\n"
587 "s_mov_b32 s19, %[s_res_a3]\n"
588 "s_mov_b32 s20, %[s_res_b0]\n"
589 "s_mov_b32 s21, %[s_res_b1]\n"
590 "s_mov_b32 s22, %[s_res_b2]\n"
591 "s_mov_b32 s23, %[s_res_b3]\n"
592 "s_mov_b32 s24, %[s_res_b4]\n"
593 "s_mov_b32 s25, %[s_res_b5]\n"
594 "s_mov_b32 s26, %[s_res_b6]\n"
595 "s_mov_b32 s27, %[s_res_b7]\n"
// m0 = 0 + s_m0_init.
597 "s_add_u32 m0, 0, %[s_m0_init]\n"
// NOTE(review): the embedded original line numbers jump here (597 -> 606) and again
// before the next group (609 -> 619); whatever sits on those lines is not visible in
// this excerpt, so the two back-to-back A advances below are probably separated by
// other instructions in the real file -- confirm.
// s86 = (loop_cnt > 1) ? s_tile_os_a : 0, then 64-bit add into s[16:17].
606 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move a with cond\n"
607 "s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond\n"
608 "s_add_u32 s16, s86, s16 ; move a with cond\n"
609 "s_addc_u32 s17, 0, s17 ; move a with cond\n"
// Same advance again, this time gated on loop_cnt > 2.
619 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond\n"
620 "s_cselect_b32 s86, %[s_tile_os_a], 0 ; move a with cond\n"
621 "s_add_u32 s16, s86, s16 ; move a with cond\n"
622 "s_addc_u32 s17, 0, s17 ; move a with cond\n"
// 32 x buffer_load_dwordx4 through resource s[20:23], filling acc[0:127]. Each of the
// eight offset operands v_os_b0..v_os_b7 is used four times with immediate offsets
// 0, 1024, 2048 and 3072.
624 "buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[20:23], 0 offen \n"
625 "buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[20:23], 0 offen offset:1024 \n"
626 "buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[20:23], 0 offen offset:2048 \n"
627 "buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[20:23], 0 offen offset:3072 \n"
628 "buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[20:23], 0 offen \n"
629 "buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[20:23], 0 offen offset:1024 \n"
630 "buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[20:23], 0 offen offset:2048 \n"
631 "buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[20:23], 0 offen offset:3072 \n"
632 "buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[20:23], 0 offen \n"
633 "buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[20:23], 0 offen offset:1024 \n"
634 "buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[20:23], 0 offen offset:2048 \n"
635 "buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[20:23], 0 offen offset:3072 \n"
636 "buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[20:23], 0 offen \n"
637 "buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[20:23], 0 offen offset:1024 \n"
638 "buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[20:23], 0 offen offset:2048 \n"
639 "buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[20:23], 0 offen offset:3072 \n"
640 "buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[20:23], 0 offen \n"
641 "buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[20:23], 0 offen offset:1024 \n"
642 "buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[20:23], 0 offen offset:2048 \n"
643 "buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[20:23], 0 offen offset:3072 \n"
644 "buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[20:23], 0 offen \n"
645 "buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[20:23], 0 offen offset:1024 \n"
646 "buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[20:23], 0 offen offset:2048 \n"
647 "buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[20:23], 0 offen offset:3072 \n"
648 "buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[20:23], 0 offen \n"
649 "buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[20:23], 0 offen offset:1024 \n"
650 "buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[20:23], 0 offen offset:2048 \n"
651 "buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[20:23], 0 offen offset:3072 \n"
652 "buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[20:23], 0 offen \n"
653 "buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[20:23], 0 offen offset:1024 \n"
654 "buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[20:23], 0 offen offset:2048 \n"
655 "buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[20:23], 0 offen offset:3072 \n"
// s86 = (loop_cnt > 1) ? s_tile_os_b : 0, then 64-bit add into s[20:21].
656 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond\n"
657 "s_cselect_b32 s86, %[s_tile_os_b], 0 ; move b with cond\n"
658 "s_add_u32 s20, s86, s20 ; move b with cond\n"
659 "s_addc_u32 s21, 0, s21 ; move b with cond\n"
// Stall until at most 40 vector-memory operations are still outstanding.
660 "s_waitcnt vmcnt(40)\n"
// Top of the g1u1 loop. Assuming these strings form an extended inline-asm template,
// `%=` expands to a number unique to the asm statement, keeping the label unique.
670 "L_start_uk_g1u1_32x512x128_16x16x16_%=:\n"
// Each half of this loop carries two operand groups instead of one: the first targets
// v_acc_0..15, the second v_acc_16..31.
// NOTE(review): the VGPR-pair / fence / accumulator lists match the src_a*_ / fence_*_ /
// acc_*_ parameter groups of _UK_PIPELINE_0/_1, so they are presumably arguments of
// pipeline-macro calls whose opening lines are not visible in this excerpt -- confirm.
//
// First half, group 1: VGPR pairs v[64:95].
673 "v[64:65]",
"v[66:67]",
"v[68:69]",
"v[70:71]",
"v[72:73]",
"v[74:75]",
"v[76:77]",
"v[78:79]",
674 "v[80:81]",
"v[82:83]",
"v[84:85]",
"v[86:87]",
"v[88:89]",
"v[90:91]",
"v[92:93]",
"v[94:95]",
// Fences: first waits for vmcnt <= 24 and lgkmcnt == 0, then s_barrier; the other
// three wait for vmcnt <= 32.
675 "s_waitcnt vmcnt(24) & lgkmcnt(0)\ns_barrier\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
// Accumulators v_acc_0 .. v_acc_15.
676 "%[v_acc_0]",
"%[v_acc_1]",
"%[v_acc_2]",
"%[v_acc_3]",
"%[v_acc_4]",
"%[v_acc_5]",
"%[v_acc_6]",
"%[v_acc_7]",
677 "%[v_acc_8]",
"%[v_acc_9]",
"%[v_acc_10]",
"%[v_acc_11]",
"%[v_acc_12]",
"%[v_acc_13]",
"%[v_acc_14]",
"%[v_acc_15]",
// First half, group 2: the same VGPR pairs v[64:95] are listed again.
681 "v[64:65]",
"v[66:67]",
"v[68:69]",
"v[70:71]",
"v[72:73]",
"v[74:75]",
"v[76:77]",
"v[78:79]",
682 "v[80:81]",
"v[82:83]",
"v[84:85]",
"v[86:87]",
"v[88:89]",
"v[90:91]",
"v[92:93]",
"v[94:95]",
// Fences for group 2: all four wait for vmcnt <= 24 (no lgkmcnt wait); the first is
// again followed by s_barrier.
683 "s_waitcnt vmcnt(24)\ns_barrier\n",
"s_waitcnt vmcnt(24)\n",
"s_waitcnt vmcnt(24)\n",
"s_waitcnt vmcnt(24)\n",
// Accumulators v_acc_16 .. v_acc_31.
684 "%[v_acc_16]",
"%[v_acc_17]",
"%[v_acc_18]",
"%[v_acc_19]",
"%[v_acc_20]",
"%[v_acc_21]",
"%[v_acc_22]",
"%[v_acc_23]",
685 "%[v_acc_24]",
"%[v_acc_25]",
"%[v_acc_26]",
"%[v_acc_27]",
"%[v_acc_28]",
"%[v_acc_29]",
"%[v_acc_30]",
"%[v_acc_31]",
// Before the decrement: s86 = (loop_cnt > 1) ? s_tile_os_b : 0, 64-bit add into
// s[24:25]. s[24:27] was loaded from s_res_b4..b7 in the g1u1 prologue.
687 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
688 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
689 "s_add_u32 s24, s86, s24 \n"
690 "s_addc_u32 s25, 0, s25 \n"
// Loop control: decrement the counter; s_cmp_gt_i32 sets SCC while it is still > 0,
// so s_cbranch_scc0 jumps to the end label once it is not.
691 "s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n"
692 "s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
693 "s_cbranch_scc0 L_end_uk_g1u1_32x512x128_16x16x16_%=\n"
// Conditional advance of s[16:17] (A side) by s_tile_os_a when loop_cnt > 2, else by 0.
694 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n"
695 "s_cselect_b32 s86, %[s_tile_os_a], 0 \n"
696 "s_add_u32 s16, s86, s16 \n"
697 "s_addc_u32 s17, 0, s17 \n"
// Conditional advance of s[20:21] (first B resource) by s_tile_os_b when loop_cnt > 1.
698 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
699 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
700 "s_add_u32 s20, s86, s20 \n"
701 "s_addc_u32 s21, 0, s21 \n"
// Assembly-comment separator only; emits no instruction.
702 ";------------------------------------------ \n"
// Second half, group 1: VGPR pairs v[96:127].
705 "v[96:97]",
"v[98:99]",
"v[100:101]",
"v[102:103]",
"v[104:105]",
"v[106:107]",
"v[108:109]",
"v[110:111]",
706 "v[112:113]",
"v[114:115]",
"v[116:117]",
"v[118:119]",
"v[120:121]",
"v[122:123]",
"v[124:125]",
"v[126:127]",
// Fences: vmcnt <= 24 with lgkmcnt == 0 plus s_barrier, then three vmcnt <= 32 waits.
707 "s_waitcnt vmcnt(24) & lgkmcnt(0)\ns_barrier\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
"s_waitcnt vmcnt(32)\n",
// Accumulators v_acc_0 .. v_acc_15.
708 "%[v_acc_0]",
"%[v_acc_1]",
"%[v_acc_2]",
"%[v_acc_3]",
"%[v_acc_4]",
"%[v_acc_5]",
"%[v_acc_6]",
"%[v_acc_7]",
709 "%[v_acc_8]",
"%[v_acc_9]",
"%[v_acc_10]",
"%[v_acc_11]",
"%[v_acc_12]",
"%[v_acc_13]",
"%[v_acc_14]",
"%[v_acc_15]",
// Second half, group 2: VGPR pairs v[96:127] listed again.
713 "v[96:97]",
"v[98:99]",
"v[100:101]",
"v[102:103]",
"v[104:105]",
"v[106:107]",
"v[108:109]",
"v[110:111]",
714 "v[112:113]",
"v[114:115]",
"v[116:117]",
"v[118:119]",
"v[120:121]",
"v[122:123]",
"v[124:125]",
"v[126:127]",
// Fences for group 2: four vmcnt <= 24 waits, the first followed by s_barrier.
715 "s_waitcnt vmcnt(24)\ns_barrier\n",
"s_waitcnt vmcnt(24)\n",
"s_waitcnt vmcnt(24)\n",
"s_waitcnt vmcnt(24)\n",
// Accumulators v_acc_16 .. v_acc_31.
716 "%[v_acc_16]",
"%[v_acc_17]",
"%[v_acc_18]",
"%[v_acc_19]",
"%[v_acc_20]",
"%[v_acc_21]",
"%[v_acc_22]",
"%[v_acc_23]",
717 "%[v_acc_24]",
"%[v_acc_25]",
"%[v_acc_26]",
"%[v_acc_27]",
"%[v_acc_28]",
"%[v_acc_29]",
"%[v_acc_30]",
"%[v_acc_31]",
// Before the decrement: conditional advance of s[24:25] by s_tile_os_b when
// loop_cnt > 1, else by 0.
719 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
720 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
721 "s_add_u32 s24, s86, s24 \n"
722 "s_addc_u32 s25, 0, s25 \n"
// Second loop-control check: decrement and leave the loop when the counter is not > 0.
723 "s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 \n"
724 "s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
725 "s_cbranch_scc0 L_end_uk_g1u1_32x512x128_16x16x16_%=\n"
// Conditional advance of s[16:17] (A side) by s_tile_os_a when loop_cnt > 2, else by 0.
726 "s_cmp_gt_i32 %[s_loop_cnt] 2 ; move a with cond \n"
727 "s_cselect_b32 s86, %[s_tile_os_a], 0 \n"
728 "s_add_u32 s16, s86, s16 \n"
729 "s_addc_u32 s17, 0, s17 \n"
// Conditional advance of s[20:21] (first B resource) by s_tile_os_b when loop_cnt > 1.
730 "s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
731 "s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
732 "s_add_u32 s20, s86, s20 \n"
733 "s_addc_u32 s21, 0, s21 \n"
// Unconditional jump back to the loop top; the end label is the target of both
// s_cbranch_scc0 exits above.
734 "s_branch L_start_uk_g1u1_32x512x128_16x16x16_%=\n"
735 "L_end_uk_g1u1_32x512x128_16x16x16_%=:\n"
// Cleanup: drop the helper macros and the two configuration macros defined at the top
// of this file so they do not remain defined after this fragment.
// NOTE(review): the embedded original line numbers jump from 747 to 772, so further
// #undef lines probably exist in between that are not visible in this excerpt.
746#undef _UK_GLD_A7_AND_L1
747#undef _UK_GLD_A7_AND_L0
772#undef CK_TILE_FLATMM_UK_2B
773#undef CK_TILE_FLATMM_UK_MFMA
// Empty definitions: each of the four macros below has an empty replacement list, so
// any later use expands to no tokens. The two function-like macros take 53 parameters:
// gld_a0_..7, sld_a0_..7, src_a0_..15, fence_0_..3, acc_0_..15 and s_base_b_.
// NOTE(review): these redefine names immediately after the #undef section and carry
// no body; they look like placeholder stubs rather than the real pipeline macros --
// confirm they are intentional before relying on them.
#define _UK_GLD_A7_AND_L0
#define _UK_PIPELINE_0(gld_a0_, gld_a1_, gld_a2_, gld_a3_, gld_a4_, gld_a5_, gld_a6_, gld_a7_, sld_a0_, sld_a1_, sld_a2_, sld_a3_, sld_a4_, sld_a5_, sld_a6_, sld_a7_, src_a0_, src_a1_, src_a2_, src_a3_, src_a4_, src_a5_, src_a6_, src_a7_, src_a8_, src_a9_, src_a10_, src_a11_, src_a12_, src_a13_, src_a14_, src_a15_, fence_0_, fence_1_, fence_2_, fence_3_, acc_0_, acc_1_, acc_2_, acc_3_, acc_4_, acc_5_, acc_6_, acc_7_, acc_8_, acc_9_, acc_10_, acc_11_, acc_12_, acc_13_, acc_14_, acc_15_, s_base_b_)
#define _UK_GLD_A7_AND_L1
#define _UK_PIPELINE_1(gld_a0_, gld_a1_, gld_a2_, gld_a3_, gld_a4_, gld_a5_, gld_a6_, gld_a7_, sld_a0_, sld_a1_, sld_a2_, sld_a3_, sld_a4_, sld_a5_, sld_a6_, sld_a7_, src_a0_, src_a1_, src_a2_, src_a3_, src_a4_, src_a5_, src_a6_, src_a7_, src_a8_, src_a9_, src_a10_, src_a11_, src_a12_, src_a13_, src_a14_, src_a15_, fence_0_, fence_1_, fence_2_, fence_3_, acc_0_, acc_1_, acc_2_, acc_3_, acc_4_, acc_5_, acc_6_, acc_7_, acc_8_, acc_9_, acc_10_, acc_11_, acc_12_, acc_13_, acc_14_, acc_15_, s_base_b_)