[AMDGPU] Extending wave reduction intrinsics for i64 types - 2 #151309
base: users/easyonaadit/amdgpu/wave-reduce-intrinsics-i64
Changes from all commits
@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
     return std::numeric_limits<int32_t>::min();
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
     return std::numeric_limits<uint32_t>::min();
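Aside on the identity choice: a reduction's identity must leave every operand unchanged, and 0 plays that role for add and sub alike, so the new 64-bit cases can share the existing `std::numeric_limits<uint32_t>::min()` return. A tiny standalone check (illustration only, not part of the patch):

```cpp
#include <limits>

// x + 0 == x and x - 0 == x, so 0 is the identity for both new cases;
// numeric_limits<uint32_t>::min() is just a spelled-out 0.
static_assert(std::numeric_limits<uint32_t>::min() == 0u, "min is 0");
static_assert(0xDEADBEEFCAFEF00Dull + 0u == 0xDEADBEEFCAFEF00Dull, "add identity");
static_assert(0xDEADBEEFCAFEF00Dull - 0u == 0xDEADBEEFCAFEF00Dull, "sub identity");
```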
@@ -5158,51 +5160,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   }
   case AMDGPU::S_XOR_B32:
   case AMDGPU::S_ADD_I32:
-  case AMDGPU::S_SUB_I32: {
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U64_PSEUDO: {
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+    Register NumActiveLanes =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

     bool IsWave32 = ST.isWave32();
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    unsigned CountReg =
+    unsigned BitCountOpc =
         IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

-    auto Exec =
-        BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+    BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);

-    auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
-                              .addReg(Exec->getOperand(0).getReg());
+    auto NewAccumulator =
+        BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
+            .addReg(ExecMask);

     switch (Opc) {
     case AMDGPU::S_XOR_B32: {
       // Performing an XOR operation on a uniform value
       // depends on the parity of the number of active lanes.
       // For even parity, the result will be 0, for odd
       // parity the result will be the same as the input value.
-      Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+      Register ParityRegister =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

-      auto ParityReg =
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
-              .addReg(NewAccumulator->getOperand(0).getReg())
-              .addImm(1);
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+          .addReg(NewAccumulator->getOperand(0).getReg())
+          .addImm(1)
+          .setOperandDead(3); // Dead scc
       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
           .addReg(SrcReg)
-          .addReg(ParityReg->getOperand(0).getReg());
+          .addReg(ParityRegister);
       break;
     }
     case AMDGPU::S_SUB_I32: {
       Register NegatedVal = MRI.createVirtualRegister(DstRegClass);

       // Take the negation of the source operand.
-      auto InvertedValReg =
-          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
-              .addImm(-1)
-              .addReg(SrcReg);
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+          .addImm(0)
+          .addReg(SrcReg);
       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-          .addReg(InvertedValReg->getOperand(0).getReg())
+          .addReg(NegatedVal)
           .addReg(NewAccumulator->getOperand(0).getReg());
       break;
     }
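The fast path above folds a reduction over a wave-uniform value into closed-form scalar arithmetic on the bit-count of EXEC. A minimal C++ sketch of the identities it relies on (hypothetical helpers, not the emitted code; `n` is the active-lane count, `v` the uniform source value):

```cpp
#include <cassert>
#include <cstdint>

// Reducing a wave-uniform value v over n active lanes collapses to:
//   xor: v ^ v ^ ... ^ v  ->  v if n is odd, 0 if n is even, i.e. v * (n & 1)
//   add: v + v + ... + v  ->  v * n
//   sub: 0 - v - ... - v  ->  (0 - v) * n
uint32_t reduceUniformXor(uint32_t v, uint32_t n) { return v * (n & 1); }
uint32_t reduceUniformAdd(uint32_t v, uint32_t n) { return v * n; }
uint32_t reduceUniformSub(uint32_t v, uint32_t n) { return (0u - v) * n; }

int main() {
  assert(reduceUniformXor(0x5A5A5A5Au, 4) == 0);           // even parity
  assert(reduceUniformXor(0x5A5A5A5Au, 5) == 0x5A5A5A5Au); // odd parity
  assert(reduceUniformAdd(7u, 32) == 224u);
  assert(reduceUniformSub(7u, 32) == (uint32_t)-224);
}
```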
@@ -5212,6 +5217,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         .addReg(NewAccumulator->getOperand(0).getReg());
     break;
   }
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO: {
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register Op1H_Op0L_Reg =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register Op1L_Op0H_Reg =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register NegatedValLo =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register NegatedValHi =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+    const TargetRegisterClass *Src1SubRC =
+        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+    MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+        MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+    MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+        MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+    if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+          .addImm(0)
+          .addReg(NewAccumulator->getOperand(0).getReg());

Review comment: Should this also have dead setcc?
Reply: Yupp, modified.

+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+          .addReg(NegatedValLo)
+          .addImm(31)
+          .setOperandDead(3); // Dead scc
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+          .add(Op1L)
+          .addReg(NegatedValHi);

Review comment: Dead setcc?

+    }
+    Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+                             ? NegatedValLo
+                             : NewAccumulator->getOperand(0).getReg();
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)

(jmmartinez marked this conversation as resolved.)

+        .add(Op1L)
+        .addReg(LowOpcode);
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+        .add(Op1L)
+        .addReg(LowOpcode);
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+        .add(Op1H)
+        .addReg(LowOpcode);
+
+    Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+        .addReg(CarryReg)
+        .addReg(Op1H_Op0L_Reg)
+        .setOperandDead(3); // Dead scc
+
+    if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+          .addReg(HiVal)
+          .addReg(Op1L_Op0H_Reg)
+          .setOperandDead(3); // Dead scc
+    }
+    BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+    break;
+  }
   }
   RetBB = &BB;
 }
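For this 64-bit uniform-value path, the result is Src * n (add) or Src * -n (sub), assembled entirely from 32-bit scalar multiplies. A standalone sketch of the decomposition, with names mirroring the virtual registers above (an illustration of the math under those naming assumptions, not the emitted MIR):

```cpp
#include <cassert>
#include <cstdint>

// Low 64 bits of (Op1H:Op1L) * (CHi:CLo), assembled the way the
// S_MUL_I32 / S_MUL_HI_U32 / S_ADD_U32 sequence above assembles them.
uint64_t mulLo64(uint32_t Op1L, uint32_t Op1H, uint32_t CLo, uint32_t CHi) {
  uint32_t DestSub0 = Op1L * CLo;                       // S_MUL_I32
  uint32_t CarryReg = (uint64_t)Op1L * CLo >> 32;       // S_MUL_HI_U32
  uint32_t Op1H_Op0L = Op1H * CLo;                      // S_MUL_I32
  uint32_t Op1L_Op0H = Op1L * CHi;                      // S_MUL_I32 (sub path only)
  uint32_t DestSub1 = CarryReg + Op1H_Op0L + Op1L_Op0H; // S_ADD_U32 chain
  return (uint64_t)DestSub1 << 32 | DestSub0;
}

int main() {
  uint64_t V = 0x123456789ABCDEF0ull;
  uint32_t N = 37; // active-lane count from S_BCNT1
  // Add path: the multiplier's high half is 0, so the Op1L_Op0H term vanishes.
  assert(mulLo64((uint32_t)V, (uint32_t)(V >> 32), N, 0) == V * N);
  // Sub path: multiply by -N, sign-extended via NegatedValHi = NegatedValLo >> 31
  // (the S_ASHR_I32 above).
  uint32_t NegatedValLo = 0u - N;
  uint32_t NegatedValHi = (uint32_t)((int32_t)NegatedValLo >> 31);
  assert(mulLo64((uint32_t)V, (uint32_t)(V >> 32), NegatedValLo, NegatedValHi) ==
         V * -(uint64_t)N);
}
```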
@@ -5377,6 +5450,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         .addReg(Accumulator->getOperand(0).getReg());
     break;
   }
+  case AMDGPU::S_ADD_U64_PSEUDO:
+  case AMDGPU::S_SUB_U64_PSEUDO: {
+    unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+                                                       : AMDGPU::S_SUB_U32;

Review comment: Capitalize
Reply: Donee.

+    unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+                                                       : AMDGPU::S_SUBB_U32;
+    Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+        MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+        &AMDGPU::SReg_32RegClass);
+    MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+        MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+        &AMDGPU::SReg_32RegClass);
+    BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+        .add(Accumlo)
+        .addReg(LaneValueLo->getOperand(0).getReg());
+    BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+        .add(Accumhi)
+        .addReg(LaneValueHi->getOperand(0).getReg());

Review comment: Dead setcc?
Reply: Yuppp, modified.

+    NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+                             TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+                         .addReg(DestLo)
+                         .addImm(AMDGPU::sub0)
+                         .addReg(DestHi)
+                         .addImm(AMDGPU::sub1);
+    break;
+  }
   }
 }
 // Manipulate the iterator to get the next active lane
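In this iterative (non-uniform) path, each loop step widens the existing 32-bit accumulate to 64 bits by pairing a low op that defines SCC with a high op that consumes it. A scalar sketch of that pairing (illustrative only; an explicit carry/borrow flag stands in for SCC):

```cpp
#include <cassert>
#include <cstdint>

// One ComputeLoop step: a 64-bit add/sub split into S_ADD_U32 + S_ADDC_U32
// (or S_SUB_U32 + S_SUBB_U32), with SCC carrying between the halves.
uint64_t accumStep(uint64_t Accum, uint64_t LaneValue, bool IsAdd) {
  uint32_t AccumLo = (uint32_t)Accum, AccumHi = (uint32_t)(Accum >> 32);
  uint32_t LaneLo = (uint32_t)LaneValue, LaneHi = (uint32_t)(LaneValue >> 32);
  if (IsAdd) {
    uint32_t DestLo = AccumLo + LaneLo;       // S_ADD_U32, sets SCC on carry
    uint32_t Scc = DestLo < AccumLo;          // carry out of the low half
    uint32_t DestHi = AccumHi + LaneHi + Scc; // S_ADDC_U32 consumes SCC
    return (uint64_t)DestHi << 32 | DestLo;
  }
  uint32_t DestLo = AccumLo - LaneLo;         // S_SUB_U32, sets SCC on borrow
  uint32_t Scc = AccumLo < LaneLo;            // borrow out of the low half
  uint32_t DestHi = AccumHi - LaneHi - Scc;   // S_SUBB_U32 consumes SCC
  return (uint64_t)DestHi << 32 | DestLo;
}

int main() {
  assert(accumStep(0xFFFFFFFFull, 1, /*IsAdd=*/true) == 0x100000000ull);
  assert(accumStep(0x100000000ull, 1, /*IsAdd=*/false) == 0xFFFFFFFFull);
}
```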
@@ -5432,8 +5533,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
Review comment: This is duplicating the base handling for these pseudos, and it also ignores targets that do have 64-bit scalar add.

Reply: I tried the WAVE_RED_ADD_PSEUDO -> S_ADD_U64_PSEUDO pipeline, but I couldn't find a way to expand the S_ADD_U64_PSEUDO beyond that. I'm not sure if I can replace one pseudo with another in the ExpandPseudo pass. I have removed some operations from the base handling which would be redundant for my use, and I've used 32-bit opcodes, so this works for all targets. I could add a check to use 64-bit scalar add for the targets which do have it.