@@ -3121,6 +3121,159 @@ bb:
3121
3121
ret void
3122
3122
}
3123
3123
3124
; Codegen test for llvm.amdgcn.mfma.i32.16x16x4i8 whose accumulator input
; (third operand) is the constant vector <16 x i32> splat (i32 64); the result
; is stored to %arg.
; The expectations below come in two shapes: three of the five configurations
; encode the constant as the literal 64 in the last MFMA source operand, while
; the other two first copy 64 into every register of a 16-register tuple
; (v_accvgpr_write_b32 or v_mov_b32) and pass that tuple instead.
; The check prefixes are defined by RUN lines outside this excerpt.
; NOTE(review): %in.1 is loaded but never referenced afterwards, and the
; expected output contains no global load for it; confirm whether the load is
; intentional or a copy-paste leftover.
+ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64 (ptr addrspace (1 ) %arg ) #0 {
3125
+ ; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3126
+ ; NOLIT-SRCC: ; %bb.0: ; %bb
3127
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3128
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
3129
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
3130
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
3131
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
3132
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
3133
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
3134
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
3135
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
3136
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
3137
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
3138
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
3139
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
3140
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
3141
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
3142
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
3143
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
3144
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3145
+ ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3146
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
3147
+ ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
3148
+ ; NOLIT-SRCC-NEXT: s_nop 7
3149
+ ; NOLIT-SRCC-NEXT: s_nop 1
3150
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
3151
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
3152
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
3153
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
3154
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3155
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3156
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3157
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3158
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
3159
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
3160
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
3161
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
3162
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
3163
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
3164
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
3165
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
3166
+ ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3167
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3168
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3169
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3170
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3171
+ ; NOLIT-SRCC-NEXT: s_endpgm
3172
+ ;
3173
+ ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3174
+ ; LIT-SRCC: ; %bb.0: ; %bb
3175
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3176
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3177
+ ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3178
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
3179
+ ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3180
+ ; LIT-SRCC-NEXT: s_nop 7
3181
+ ; LIT-SRCC-NEXT: s_nop 1
3182
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
3183
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
3184
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
3185
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
3186
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3187
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3188
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3189
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3190
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
3191
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
3192
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
3193
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
3194
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
3195
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
3196
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
3197
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
3198
+ ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3199
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3200
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3201
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3202
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3203
+ ; LIT-SRCC-NEXT: s_endpgm
3204
+ ;
3205
+ ; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3206
+ ; GFX90A: ; %bb.0: ; %bb
3207
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 1
3208
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, 2
3209
+ ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3210
+ ; GFX90A-NEXT: s_nop 0
3211
+ ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3212
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
3213
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3214
+ ; GFX90A-NEXT: s_nop 7
3215
+ ; GFX90A-NEXT: s_nop 0
3216
+ ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
3217
+ ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
3218
+ ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
3219
+ ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3220
+ ; GFX90A-NEXT: s_endpgm
3221
+ ;
3222
+ ; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3223
+ ; GFX942: ; %bb.0: ; %bb
3224
+ ; GFX942-NEXT: v_mov_b32_e32 v0, 1
3225
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 2
3226
+ ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3227
+ ; GFX942-NEXT: s_nop 0
3228
+ ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3229
+ ; GFX942-NEXT: v_mov_b32_e32 v0, 0
3230
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3231
+ ; GFX942-NEXT: s_nop 7
3232
+ ; GFX942-NEXT: s_nop 0
3233
+ ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
3234
+ ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
3235
+ ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
3236
+ ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3237
+ ; GFX942-NEXT: s_endpgm
3238
+ ;
3239
+ ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3240
+ ; GFX942-VGPR: ; %bb.0: ; %bb
3241
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
3242
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
3243
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3244
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3245
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3246
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
3247
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
3248
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
3249
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
3250
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
3251
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
3252
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
3253
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
3254
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
3255
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
3256
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
3257
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
3258
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
3259
+ ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3260
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
3261
+ ; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
3262
+ ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3263
+ ; GFX942-VGPR-NEXT: s_nop 7
3264
+ ; GFX942-VGPR-NEXT: s_nop 1
3265
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3266
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3267
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3268
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3269
+ ; GFX942-VGPR-NEXT: s_endpgm
3270
+ bb:
3271
+ %in.1 = load <16 x i32 >, ptr addrspace (1 ) %arg
3272
+ %mai.1 = tail call <16 x i32 > @llvm.amdgcn.mfma.i32.16x16x4i8 (i32 1 , i32 2 , <16 x i32 > splat (i32 64 ), i32 1 , i32 2 , i32 3 )
3273
+ store <16 x i32 > %mai.1 , ptr addrspace (1 ) %arg
3274
+ ret void
3275
+ }
3276
+
3124
3277
define amdgpu_kernel void @test_mfma_i32_4x4x4i8 (ptr addrspace (1 ) %arg ) #0 {
3125
3278
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
3126
3279
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3239,6 +3392,200 @@ bb:
3239
3392
ret void
3240
3393
}
3241
3394
3395
; Codegen test for llvm.amdgcn.mfma.i32.4x4x4i8 whose accumulator input
; (third operand) is the constant vector <4 x i32> splat (i32 1); the result
; is stored to %arg.
; In the expectations below, three of the five configurations encode the
; constant as the literal 1 in the last MFMA source operand; the other two
; fill a 4-register tuple with 1 first and pass that tuple instead.
; The check prefixes are defined by RUN lines outside this excerpt.
; NOTE(review): %in.1 is loaded but never referenced afterwards, and the
; expected output contains no global load for it; confirm whether the load is
; intentional or a copy-paste leftover.
+ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1 (ptr addrspace (1 ) %arg ) #0 {
3396
+ ; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3397
+ ; NOLIT-SRCC: ; %bb.0: ; %bb
3398
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3399
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
3400
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
3401
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
3402
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
3403
+ ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3404
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3405
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3406
+ ; NOLIT-SRCC-NEXT: s_nop 0
3407
+ ; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
3408
+ ; NOLIT-SRCC-NEXT: s_nop 3
3409
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3410
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3411
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3412
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3413
+ ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3414
+ ; NOLIT-SRCC-NEXT: s_nop 0
3415
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3416
+ ; NOLIT-SRCC-NEXT: s_endpgm
3417
+ ;
3418
+ ; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3419
+ ; LIT-SRCC: ; %bb.0: ; %bb
3420
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3421
+ ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3422
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3423
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3424
+ ; LIT-SRCC-NEXT: s_nop 0
3425
+ ; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
3426
+ ; LIT-SRCC-NEXT: s_nop 3
3427
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3428
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3429
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3430
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3431
+ ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3432
+ ; LIT-SRCC-NEXT: s_nop 0
3433
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3434
+ ; LIT-SRCC-NEXT: s_endpgm
3435
+ ;
3436
+ ; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3437
+ ; GFX90A: ; %bb.0: ; %bb
3438
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 1
3439
+ ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3440
+ ; GFX90A-NEXT: v_mov_b32_e32 v2, 2
3441
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, 0
3442
+ ; GFX90A-NEXT: s_nop 0
3443
+ ; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
3444
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3445
+ ; GFX90A-NEXT: s_nop 3
3446
+ ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
3447
+ ; GFX90A-NEXT: s_endpgm
3448
+ ;
3449
+ ; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3450
+ ; GFX942: ; %bb.0: ; %bb
3451
+ ; GFX942-NEXT: v_mov_b32_e32 v0, 1
3452
+ ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3453
+ ; GFX942-NEXT: v_mov_b32_e32 v2, 2
3454
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 0
3455
+ ; GFX942-NEXT: s_nop 0
3456
+ ; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
3457
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3458
+ ; GFX942-NEXT: s_nop 3
3459
+ ; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
3460
+ ; GFX942-NEXT: s_endpgm
3461
+ ;
3462
+ ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3463
+ ; GFX942-VGPR: ; %bb.0: ; %bb
3464
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
3465
+ ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3466
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3467
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3468
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3469
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
3470
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
3471
+ ; GFX942-VGPR-NEXT: s_nop 0
3472
+ ; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
3473
+ ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3474
+ ; GFX942-VGPR-NEXT: s_nop 3
3475
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3476
+ ; GFX942-VGPR-NEXT: s_endpgm
3477
+ bb:
3478
+ %in.1 = load <4 x i32 >, ptr addrspace (1 ) %arg
3479
+ %mai.1 = tail call <4 x i32 > @llvm.amdgcn.mfma.i32.4x4x4i8 (i32 1 , i32 2 , <4 x i32 > splat (i32 1 ), i32 1 , i32 2 , i32 3 )
3480
+ store <4 x i32 > %mai.1 , ptr addrspace (1 ) %arg
3481
+ ret void
3482
+ }
3483
+
3484
; Codegen test for llvm.amdgcn.mfma.i32.4x4x4i8 whose accumulator input
; (third operand) is the constant vector <4 x i32> splat (i32 65); the result
; is stored to %arg.
; Every configuration checked below materializes 0x41 (65) in a register,
; replicates it across a 4-register tuple and passes that tuple as the last
; MFMA source operand; none of them encodes 65 directly in the instruction.
; The check prefixes are defined by RUN lines outside this excerpt.
; NOTE(review): presumably 65 was chosen because it cannot be encoded as an
; inline operand -- confirm against the ISA documentation.
; NOTE(review): the function name ends in "_1" although the splat constant is
; 65; confirm the suffix is intended.
; NOTE(review): %in.1 is loaded but never referenced afterwards, and the
; expected output contains no global load for it; confirm whether the load is
; intentional or a copy-paste leftover.
+ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1 (ptr addrspace (1 ) %arg ) #0 {
3485
+ ; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3486
+ ; NOLIT-SRCC: ; %bb.0:
3487
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
3488
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
3489
+ ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3490
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
3491
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
3492
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
3493
+ ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
3494
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
3495
+ ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3496
+ ; NOLIT-SRCC-NEXT: s_nop 0
3497
+ ; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
3498
+ ; NOLIT-SRCC-NEXT: s_nop 3
3499
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3500
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3501
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3502
+ ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3503
+ ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3504
+ ; NOLIT-SRCC-NEXT: s_nop 0
3505
+ ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3506
+ ; NOLIT-SRCC-NEXT: s_endpgm
3507
+ ;
3508
+ ; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3509
+ ; LIT-SRCC: ; %bb.0:
3510
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
3511
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
3512
+ ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3513
+ ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
3514
+ ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
3515
+ ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
3516
+ ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
3517
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
3518
+ ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3519
+ ; LIT-SRCC-NEXT: s_nop 0
3520
+ ; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
3521
+ ; LIT-SRCC-NEXT: s_nop 3
3522
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3523
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3524
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3525
+ ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3526
+ ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3527
+ ; LIT-SRCC-NEXT: s_nop 0
3528
+ ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3529
+ ; LIT-SRCC-NEXT: s_endpgm
3530
+ ;
3531
+ ; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3532
+ ; GFX90A: ; %bb.0:
3533
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
3534
+ ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
3535
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, 1
3536
+ ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3537
+ ; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
3538
+ ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
3539
+ ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
3540
+ ; GFX90A-NEXT: v_mov_b32_e32 v2, 2
3541
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
3542
+ ; GFX90A-NEXT: s_nop 0
3543
+ ; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
3544
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3545
+ ; GFX90A-NEXT: s_nop 3
3546
+ ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3547
+ ; GFX90A-NEXT: s_endpgm
3548
+ ;
3549
+ ; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3550
+ ; GFX942: ; %bb.0:
3551
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
3552
+ ; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
3553
+ ; GFX942-NEXT: v_mov_b32_e32 v1, 1
3554
+ ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3555
+ ; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
3556
+ ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
3557
+ ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
3558
+ ; GFX942-NEXT: v_mov_b32_e32 v2, 2
3559
+ ; GFX942-NEXT: v_mov_b32_e32 v0, 0
3560
+ ; GFX942-NEXT: s_nop 0
3561
+ ; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
3562
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3563
+ ; GFX942-NEXT: s_nop 3
3564
+ ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3565
+ ; GFX942-NEXT: s_endpgm
3566
+ ;
3567
+ ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3568
+ ; GFX942-VGPR: ; %bb.0:
3569
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
3570
+ ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3571
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
3572
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3573
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3574
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3575
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
3576
+ ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
3577
+ ; GFX942-VGPR-NEXT: s_nop 0
3578
+ ; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
3579
+ ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3580
+ ; GFX942-VGPR-NEXT: s_nop 3
3581
+ ; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3582
+ ; GFX942-VGPR-NEXT: s_endpgm
3583
+ %in.1 = load <4 x i32 >, ptr addrspace (1 ) %arg
3584
+ %mai.1 = tail call <4 x i32 > @llvm.amdgcn.mfma.i32.4x4x4i8 (i32 1 , i32 2 , <4 x i32 > splat (i32 65 ), i32 1 , i32 2 , i32 3 )
3585
+ store <4 x i32 > %mai.1 , ptr addrspace (1 ) %arg
3586
+ ret void
3587
+ }
3588
+
3242
3589
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc (ptr addrspace (1 ) %arg ) #0 {
3243
3590
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
3244
3591
; NOLIT-SRCC: ; %bb.0: ; %bb
0 commit comments