Skip to content

Commit 4d8c839

Browse files
committed
AMDGPU: Add a few mfma tests with immediate splat src2
1 parent bd7db75 commit 4d8c839

File tree

1 file changed

+347
-0
lines changed

1 file changed

+347
-0
lines changed

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll

Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3121,6 +3121,159 @@ bb:
31213121
ret void
31223122
}
31233123

3124+
; Kernel test: calls llvm.amdgcn.mfma.i32.16x16x4i8 with A = 1, B = 2 and a
; constant src2 accumulator of <16 x i32> splat (i32 64), then stores the
; result to %arg. The trailing i32 1, 2, 3 arguments show up as
; cbsz:1 abid:2 blgp:3 in the expected assembly.
; Expected handling of the splat, per check prefix below:
;   - NOLIT-SRCC writes 64 into a0..a15 and passes a[0:15] as src2.
;   - LIT-SRCC, GFX90A and GFX942 pass the literal 64 directly as src2.
;   - GFX942-VGPR builds the splat in v[0:15] (v0 = 64, v1..v15 copied from v0).
define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
3125+
; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3126+
; NOLIT-SRCC: ; %bb.0: ; %bb
3127+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3128+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
3129+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
3130+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
3131+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
3132+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
3133+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
3134+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
3135+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
3136+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
3137+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
3138+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
3139+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
3140+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
3141+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
3142+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
3143+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
3144+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3145+
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3146+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
3147+
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
3148+
; NOLIT-SRCC-NEXT: s_nop 7
3149+
; NOLIT-SRCC-NEXT: s_nop 1
3150+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
3151+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
3152+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
3153+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
3154+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3155+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3156+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3157+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3158+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
3159+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
3160+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
3161+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
3162+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
3163+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
3164+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
3165+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
3166+
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3167+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3168+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3169+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3170+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3171+
; NOLIT-SRCC-NEXT: s_endpgm
3172+
;
3173+
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3174+
; LIT-SRCC: ; %bb.0: ; %bb
3175+
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3176+
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3177+
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3178+
; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
3179+
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3180+
; LIT-SRCC-NEXT: s_nop 7
3181+
; LIT-SRCC-NEXT: s_nop 1
3182+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
3183+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
3184+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
3185+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
3186+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3187+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3188+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3189+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3190+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
3191+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
3192+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
3193+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
3194+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
3195+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
3196+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
3197+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
3198+
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3199+
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3200+
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3201+
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3202+
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3203+
; LIT-SRCC-NEXT: s_endpgm
3204+
;
3205+
; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3206+
; GFX90A: ; %bb.0: ; %bb
3207+
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
3208+
; GFX90A-NEXT: v_mov_b32_e32 v1, 2
3209+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3210+
; GFX90A-NEXT: s_nop 0
3211+
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3212+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
3213+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3214+
; GFX90A-NEXT: s_nop 7
3215+
; GFX90A-NEXT: s_nop 0
3216+
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
3217+
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
3218+
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
3219+
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3220+
; GFX90A-NEXT: s_endpgm
3221+
;
3222+
; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3223+
; GFX942: ; %bb.0: ; %bb
3224+
; GFX942-NEXT: v_mov_b32_e32 v0, 1
3225+
; GFX942-NEXT: v_mov_b32_e32 v1, 2
3226+
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3227+
; GFX942-NEXT: s_nop 0
3228+
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
3229+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
3230+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3231+
; GFX942-NEXT: s_nop 7
3232+
; GFX942-NEXT: s_nop 0
3233+
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
3234+
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
3235+
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
3236+
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3237+
; GFX942-NEXT: s_endpgm
3238+
;
3239+
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
3240+
; GFX942-VGPR: ; %bb.0: ; %bb
3241+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 1
3242+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
3243+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3244+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3245+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3246+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
3247+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
3248+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
3249+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
3250+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
3251+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
3252+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
3253+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
3254+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
3255+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
3256+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
3257+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v0
3258+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 2
3259+
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3260+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
3261+
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v17, v18, v[0:15] cbsz:1 abid:2 blgp:3
3262+
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3263+
; GFX942-VGPR-NEXT: s_nop 7
3264+
; GFX942-VGPR-NEXT: s_nop 1
3265+
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
3266+
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
3267+
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
3268+
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
3269+
; GFX942-VGPR-NEXT: s_endpgm
3270+
bb:
3271+
; NOTE(review): %in.1 is loaded but never used; src2 of the intrinsic is the
; splat constant, and no global_load appears in the expected assembly above.
; Presumably left over from the test this was derived from -- confirm.
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
3272+
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
3273+
store <16 x i32> %mai.1, ptr addrspace(1) %arg
3274+
ret void
3275+
}
3276+
31243277
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
31253278
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
31263279
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3239,6 +3392,200 @@ bb:
32393392
ret void
32403393
}
32413394

3395+
; Kernel test: calls llvm.amdgcn.mfma.i32.4x4x4i8 with A = 1, B = 2 and a
; constant src2 accumulator of <4 x i32> splat (i32 1), then stores the result
; to %arg. The trailing i32 1, 2, 3 arguments show up as cbsz:1 abid:2 blgp:3
; in the expected assembly.
; Expected handling of the splat, per check prefix below:
;   - NOLIT-SRCC writes 1 into a0..a3 and passes a[0:3] as src2.
;   - LIT-SRCC, GFX90A and GFX942 pass the literal 1 directly as src2.
;   - GFX942-VGPR builds the splat in v[0:3]; v0 (= 1) is also used as the
;     A operand of the MFMA.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
3396+
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3397+
; NOLIT-SRCC: ; %bb.0: ; %bb
3398+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3399+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
3400+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
3401+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
3402+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
3403+
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3404+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3405+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3406+
; NOLIT-SRCC-NEXT: s_nop 0
3407+
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
3408+
; NOLIT-SRCC-NEXT: s_nop 3
3409+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3410+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3411+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3412+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3413+
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3414+
; NOLIT-SRCC-NEXT: s_nop 0
3415+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3416+
; NOLIT-SRCC-NEXT: s_endpgm
3417+
;
3418+
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3419+
; LIT-SRCC: ; %bb.0: ; %bb
3420+
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
3421+
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3422+
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
3423+
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3424+
; LIT-SRCC-NEXT: s_nop 0
3425+
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
3426+
; LIT-SRCC-NEXT: s_nop 3
3427+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3428+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3429+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3430+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3431+
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3432+
; LIT-SRCC-NEXT: s_nop 0
3433+
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3434+
; LIT-SRCC-NEXT: s_endpgm
3435+
;
3436+
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3437+
; GFX90A: ; %bb.0: ; %bb
3438+
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
3439+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3440+
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
3441+
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
3442+
; GFX90A-NEXT: s_nop 0
3443+
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
3444+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3445+
; GFX90A-NEXT: s_nop 3
3446+
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
3447+
; GFX90A-NEXT: s_endpgm
3448+
;
3449+
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3450+
; GFX942: ; %bb.0: ; %bb
3451+
; GFX942-NEXT: v_mov_b32_e32 v0, 1
3452+
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3453+
; GFX942-NEXT: v_mov_b32_e32 v2, 2
3454+
; GFX942-NEXT: v_mov_b32_e32 v1, 0
3455+
; GFX942-NEXT: s_nop 0
3456+
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
3457+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3458+
; GFX942-NEXT: s_nop 3
3459+
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
3460+
; GFX942-NEXT: s_endpgm
3461+
;
3462+
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
3463+
; GFX942-VGPR: ; %bb.0: ; %bb
3464+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
3465+
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3466+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3467+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3468+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3469+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2
3470+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
3471+
; GFX942-VGPR-NEXT: s_nop 0
3472+
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v5, v[0:3] cbsz:1 abid:2 blgp:3
3473+
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3474+
; GFX942-VGPR-NEXT: s_nop 3
3475+
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3476+
; GFX942-VGPR-NEXT: s_endpgm
3477+
bb:
3478+
; NOTE(review): %in.1 is loaded but never used; src2 of the intrinsic is the
; splat constant, and no global_load appears in the expected assembly above.
; Presumably left over from the test this was derived from -- confirm.
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
3479+
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
3480+
store <4 x i32> %mai.1, ptr addrspace(1) %arg
3481+
ret void
3482+
}
3483+
3484+
; Kernel test: calls llvm.amdgcn.mfma.i32.4x4x4i8 with A = 1, B = 2 and a
; constant src2 accumulator of <4 x i32> splat (i32 65), then stores the result
; to %arg. The trailing i32 1, 2, 3 arguments show up as cbsz:1 abid:2 blgp:3
; in the expected assembly.
; Every check prefix below expects 65 (0x41) to be moved into a VGPR first and
; src2 to be a register tuple (a[0:3], or v[0:3] for GFX942-VGPR); none of them
; passes a literal as the src2 operand.
; NOTE(review): presumably this is because 65 is not encodable as an inline
; constant on AMDGPU -- confirm against the ISA operand encoding docs.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
3485+
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3486+
; NOLIT-SRCC: ; %bb.0:
3487+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
3488+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
3489+
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3490+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
3491+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
3492+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
3493+
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
3494+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
3495+
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3496+
; NOLIT-SRCC-NEXT: s_nop 0
3497+
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
3498+
; NOLIT-SRCC-NEXT: s_nop 3
3499+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3500+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3501+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3502+
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3503+
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3504+
; NOLIT-SRCC-NEXT: s_nop 0
3505+
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3506+
; NOLIT-SRCC-NEXT: s_endpgm
3507+
;
3508+
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3509+
; LIT-SRCC: ; %bb.0:
3510+
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
3511+
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
3512+
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3513+
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
3514+
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
3515+
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
3516+
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
3517+
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
3518+
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
3519+
; LIT-SRCC-NEXT: s_nop 0
3520+
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
3521+
; LIT-SRCC-NEXT: s_nop 3
3522+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
3523+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
3524+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
3525+
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
3526+
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
3527+
; LIT-SRCC-NEXT: s_nop 0
3528+
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3529+
; LIT-SRCC-NEXT: s_endpgm
3530+
;
3531+
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3532+
; GFX90A: ; %bb.0:
3533+
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
3534+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
3535+
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
3536+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3537+
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
3538+
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
3539+
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
3540+
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
3541+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
3542+
; GFX90A-NEXT: s_nop 0
3543+
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
3544+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
3545+
; GFX90A-NEXT: s_nop 3
3546+
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3547+
; GFX90A-NEXT: s_endpgm
3548+
;
3549+
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3550+
; GFX942: ; %bb.0:
3551+
; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
3552+
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
3553+
; GFX942-NEXT: v_mov_b32_e32 v1, 1
3554+
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3555+
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
3556+
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
3557+
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
3558+
; GFX942-NEXT: v_mov_b32_e32 v2, 2
3559+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
3560+
; GFX942-NEXT: s_nop 0
3561+
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
3562+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
3563+
; GFX942-NEXT: s_nop 3
3564+
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
3565+
; GFX942-NEXT: s_endpgm
3566+
;
3567+
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
3568+
; GFX942-VGPR: ; %bb.0:
3569+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
3570+
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3571+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
3572+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
3573+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
3574+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
3575+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
3576+
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
3577+
; GFX942-VGPR-NEXT: s_nop 0
3578+
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
3579+
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
3580+
; GFX942-VGPR-NEXT: s_nop 3
3581+
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3582+
; GFX942-VGPR-NEXT: s_endpgm
3583+
; NOTE(review): %in.1 is loaded but never used; src2 of the intrinsic is the
; splat constant, and no global_load appears in the expected assembly above.
; Presumably left over from the test this was derived from -- confirm.
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
3584+
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
3585+
store <4 x i32> %mai.1, ptr addrspace(1) %arg
3586+
ret void
3587+
}
3588+
32423589
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
32433590
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
32443591
; NOLIT-SRCC: ; %bb.0: ; %bb

0 commit comments

Comments
 (0)