Skip to content

Commit 49d89bc

Browse files
authored
[AMDGPU] Add gfx1250 cvt_pk|sr_fp8|bf8_f32 instructions (#151595)
1 parent d3a9cde commit 49d89bc

17 files changed

+580
-33
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -707,6 +707,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
707707
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
708708
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
709709
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
710+
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
711+
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
710712
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
711713
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
712714

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -652,6 +652,60 @@ void test_prefetch(generic void *fptr, global void *gptr) {
652652
__builtin_amdgcn_global_prefetch(gptr, 8);
653653
}
654654

655+
// CHECK-LABEL: @test_cvt_pk_fp8_f32_e5m3(
656+
// CHECK-NEXT: entry:
657+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
658+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
659+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
660+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
661+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
662+
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
663+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
664+
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
665+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
666+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
667+
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
668+
// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
669+
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
670+
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
671+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
672+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.pk.fp8.f32.e5m3(float [[TMP0]], float [[TMP1]], i32 [[TMP2]], i1 true)
673+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
674+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
675+
// CHECK-NEXT: ret void
676+
//
677+
void test_cvt_pk_fp8_f32_e5m3(global int* out, int old, float a, float b)
678+
{
679+
*out = __builtin_amdgcn_cvt_pk_fp8_f32_e5m3(a, b, old, true);
680+
}
681+
682+
// CHECK-LABEL: @test_cvt_sr_fp8_f32_e5m3(
683+
// CHECK-NEXT: entry:
684+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
685+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
686+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
687+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
688+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
689+
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
690+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
691+
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
692+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
693+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
694+
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
695+
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
696+
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
697+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
698+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
699+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f32.e5m3(float [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 3)
700+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
701+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
702+
// CHECK-NEXT: ret void
703+
//
704+
void test_cvt_sr_fp8_f32_e5m3(global int* out, int old, float a, int b)
705+
{
706+
*out = __builtin_amdgcn_cvt_sr_fp8_f32_e5m3(a, b, old, 3);
707+
}
708+
655709
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
656710
// CHECK-NEXT: entry:
657711
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3505,6 +3505,12 @@ def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
35053505
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
35063506
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
35073507

3508+
// llvm.amdgcn.cvt.pk.fp8.f32.e5m3 int vdst, float srcA, float srcB, int old, imm word_sel
3509+
def int_amdgcn_cvt_pk_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32_e5m3">,
3510+
DefaultAttrsIntrinsic<[llvm_i32_ty],
3511+
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
3512+
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
3513+
35083514
// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
35093515
// byte_sel selects byte to write into vdst.
35103516
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
@@ -3518,6 +3524,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
35183524
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
35193525
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
35203526

3527+
// llvm.amdgcn.cvt.sr.fp8.f32.e5m3 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
3528+
def int_amdgcn_cvt_sr_fp8_f32_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32_e5m3">,
3529+
DefaultAttrsIntrinsic<[llvm_i32_ty],
3530+
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
3531+
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
3532+
35213533
// llvm.amdgcn.cvt.off.f32.i4 float vdst, int srcA
35223534
def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">,
35233535
DefaultAttrsIntrinsic<[llvm_float_ty],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4633,8 +4633,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
46334633
case Intrinsic::amdgcn_cvt_pk_f32_fp8:
46344634
case Intrinsic::amdgcn_cvt_pk_f32_bf8:
46354635
case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4636+
case Intrinsic::amdgcn_cvt_pk_fp8_f32_e5m3:
46364637
case Intrinsic::amdgcn_cvt_pk_bf8_f32:
46374638
case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4639+
case Intrinsic::amdgcn_cvt_sr_fp8_f32_e5m3:
46384640
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
46394641
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
46404642
case Intrinsic::amdgcn_cvt_sr_f16_f32:

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9366,17 +9366,17 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
93669366
}
93679367
}
93689368

9369+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
9370+
addOptionalImmOperand(Inst, Operands, OptionalIdx,
9371+
AMDGPUOperand::ImmTyClamp);
9372+
93699373
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
93709374
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
93719375
Inst.addOperand(Inst.getOperand(0));
93729376
addOptionalImmOperand(Inst, Operands, OptionalIdx,
93739377
AMDGPUOperand::ImmTyByteSel);
93749378
}
93759379

9376-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
9377-
addOptionalImmOperand(Inst, Operands, OptionalIdx,
9378-
AMDGPUOperand::ImmTyClamp);
9379-
93809380
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
93819381
addOptionalImmOperand(Inst, Operands, OptionalIdx,
93829382
AMDGPUOperand::ImmTyOModSI);
@@ -9430,6 +9430,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
94309430
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
94319431
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
94329432
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
9433+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp_gfx1250 ||
9434+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx1250_e64_dpp8_gfx1250 ||
94339435
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
94349436
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
94359437
Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 ||
@@ -10038,9 +10040,12 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
1003810040
addOptionalImmOperand(Inst, Operands, OptionalIdx,
1003910041
AMDGPUOperand::ImmTyClamp);
1004010042

10041-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
10043+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
10044+
if (VdstInIdx == static_cast<int>(Inst.getNumOperands()))
10045+
Inst.addOperand(Inst.getOperand(0));
1004210046
addOptionalImmOperand(Inst, Operands, OptionalIdx,
1004310047
AMDGPUOperand::ImmTyByteSel);
10048+
}
1004410049

1004510050
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
1004610051
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);

0 commit comments

Comments
 (0)