Skip to content

[AMDGPU] Extending wave reduction intrinsics for i64 types - 2 #151309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: users/easyonaadit/amdgpu/wave-reduce-intrinsics-i64
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 123 additions & 18 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
Expand Down Expand Up @@ -5158,51 +5160,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
Register NumActiveLanes =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
unsigned BitCountOpc =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
.addReg(Exec->getOperand(0).getReg());
auto NewAccumulator =
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
.addReg(ExecMask);

switch (Opc) {
case AMDGPU::S_XOR_B32: {
// Performing an XOR operation on a uniform value
// depends on the parity of the number of active lanes.
// For even parity, the result will be 0, for odd
// parity the result will be the same as the input value.
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
Register ParityRegister =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1)
.setOperandDead(3); // Dead scc
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg());
.addReg(ParityRegister);
break;
}
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

// Take the negation of the source operand.
auto InvertedValReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
.addImm(-1)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
.addImm(0)
.addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
Expand All @@ -5212,6 +5217,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is duplicating the base handling for these pseudos, and also is ignoring targets that do have 64-bit scalar add

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried WAVE_RED_ADD_PSEUDO -> S_ADD_U64_PSEUDO pipeline, but I couldn't find a way to expand the S_ADD_U64_PSEUDO beyond that. I'm not sure if I can replace one pseudo with another in the ExpandPseudo pass.
I have removed some operations from the base handling which would be redundant for my use, and I've used 32-bit Opcodes, so this works for all targets. I could add a check to use 64-bit scalar add for the targets which do have it.

Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register Op1H_Op0L_Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register Op1L_Op0H_Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register NegatedValLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register NegatedValHi =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *Src1SubRC =
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);

MachineOperand Op1L = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
.addImm(0)
.addReg(NewAccumulator->getOperand(0).getReg());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this also have dead scc?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yupp, modified.

BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
.addReg(NegatedValLo)
.addImm(31)
.setOperandDead(3); // Dead scc
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
.add(Op1L)
.addReg(NegatedValHi);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead scc?

}
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
? NegatedValLo
: NewAccumulator->getOperand(0).getReg();
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
.add(Op1L)
.addReg(LowOpcode);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
.add(Op1L)
.addReg(LowOpcode);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
.add(Op1H)
.addReg(LowOpcode);

Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
.addReg(CarryReg)
.addReg(Op1H_Op0L_Reg)
.setOperandDead(3); // Dead scc

if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
.addReg(HiVal)
.addReg(Op1L_Op0H_Reg)
.setOperandDead(3); // Dead scc
}
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);
break;
}
}
RetBB = &BB;
}
Expand Down Expand Up @@ -5377,6 +5450,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(Accumulator->getOperand(0).getReg());
break;
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Capitalize

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

: AMDGPU::S_SUB_U32;
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
: AMDGPU::S_SUBB_U32;
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
&AMDGPU::SReg_32RegClass);
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
&AMDGPU::SReg_32RegClass);
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
.add(Accumlo)
.addReg(LaneValueLo->getOperand(0).getReg());
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
.add(Accumhi)
.addReg(LaneValueHi->getOperand(0).getReg());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead scc?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yuppp, modified.

NewAccumulator = BuildMI(*ComputeLoop, I, DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
.addReg(DestLo)
.addImm(AMDGPU::sub0)
.addReg(DestHi)
.addImm(AMDGPU::sub1);
break;
}
}
}
// Manipulate the iterator to get the next active lane
Expand Down Expand Up @@ -5432,8 +5533,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ defvar Operations = [
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
];

foreach Op = Operations in {
Expand Down
Loading
Loading