diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 3808a26a0b92a..a5e1522753c8b 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -50,7 +50,7 @@ add_llvm_target(PowerPCCodeGen PPCTargetTransformInfo.cpp PPCTOCRegDeps.cpp PPCTLSDynamicCall.cpp - PPCVSXCopy.cpp + PPCVSXWACCCopy.cpp PPCReduceCRLogicals.cpp PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 124dac4584312..a8f0f215ebee5 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -39,7 +39,7 @@ class ModulePass; FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM); FunctionPass *createPPCTOCRegDepsPass(); FunctionPass *createPPCEarlyReturnPass(); - FunctionPass *createPPCVSXCopyPass(); + FunctionPass *createPPCVSXWACCCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); FunctionPass *createPPCReduceCRLogicalsPass(); @@ -64,7 +64,7 @@ class ModulePass; void initializePPCLoopInstrFormPrepPass(PassRegistry&); void initializePPCTOCRegDepsPass(PassRegistry&); void initializePPCEarlyReturnPass(PassRegistry&); - void initializePPCVSXCopyPass(PassRegistry&); + void initializePPCVSXWACCCopyPass(PassRegistry&); void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCVSXSwapRemovalPass(PassRegistry&); void initializePPCReduceCRLogicalsPass(PassRegistry&); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 7c1550e99bae1..7cb7e05b55ca0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" @@ -1863,6 +1864,48 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcRegSub1) .addReg(SrcRegSub1, getKillRegState(KillSrc)); return; + } else if ((PPC::WACCRCRegClass.contains(DestReg) || + PPC::WACC_HIRCRegClass.contains(DestReg)) && + (PPC::WACCRCRegClass.contains(SrcReg) || + PPC::WACC_HIRCRegClass.contains(SrcReg))) { + + Opc = PPC::WACCRCRegClass.contains(SrcReg) ? PPC::DMXXEXTFDMR512 + : PPC::DMXXEXTFDMR512_HI; + + RegScavenger RS; + RS.enterBasicBlockEnd(MBB); + RS.backward(std::next(I)); + + Register TmpReg1 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I, + /* RestoreAfter */ false, 0, + /* AllowSpill */ false); + + RS.setRegUsed(TmpReg1); + Register TmpReg2 = RS.scavengeRegisterBackwards(PPC::VSRpRCRegClass, I, + /* RestoreAfter */ false, 0, + /* AllowSpill */ false); + + BuildMI(MBB, I, DL, get(Opc)) + .addReg(TmpReg1, RegState::Define) + .addReg(TmpReg2, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)); + + Opc = PPC::WACCRCRegClass.contains(DestReg) ? PPC::DMXXINSTDMR512 + : PPC::DMXXINSTDMR512_HI; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(TmpReg1, RegState::Kill) + .addReg(TmpReg2, RegState::Kill); + + return; + } else if (PPC::DMRRCRegClass.contains(DestReg) && + PPC::DMRRCRegClass.contains(SrcReg)) { + + BuildMI(MBB, I, DL, get(PPC::DMMR), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + + return; + } else llvm_unreachable("Impossible reg-to-reg copy"); diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index b5c6ac111dff0..ae92d5eab20cd 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -129,7 +129,7 @@ LLVMInitializePowerPCTarget() { initializePPCLoopInstrFormPrepPass(PR); initializePPCTOCRegDepsPass(PR); initializePPCEarlyReturnPass(PR); - initializePPCVSXCopyPass(PR); + initializePPCVSXWACCCopyPass(PR); initializePPCVSXFMAMutatePass(PR); initializePPCVSXSwapRemovalPass(PR); initializePPCReduceCRLogicalsPass(PR); @@ -528,7 +528,7 @@ bool PPCPassConfig::addInstSelector() { addPass(createPPCCTRLoopsVerify()); #endif - addPass(createPPCVSXCopyPass()); + addPass(createPPCVSXWACCCopyPass()); return false; } diff --git a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp similarity index 76% rename from llvm/lib/Target/PowerPC/PPCVSXCopy.cpp rename to llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp index 794095cd43769..044c945fc2049 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXWACCCopy.cpp @@ -1,4 +1,4 @@ -//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===// +//===-------------- PPCVSXWACCCopy.cpp - VSX and WACC Copy Legalization ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,7 +8,7 @@ // // A pass which deals with the complexity of generating legal VSX register // copies to/from register classes which partially overlap with the VSX -// register file. +// register file and combines the wacc/wacc_hi copies when needed. // //===----------------------------------------------------------------------===// @@ -29,12 +29,12 @@ using namespace llvm; #define DEBUG_TYPE "ppc-vsx-copy" namespace { - // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers + // PPCVSXWACCCopy pass - For copies between VSX registers and non-VSX registers // (Altivec and scalar floating-point registers), we need to transform the // copies into subregister copies with other restrictions. - struct PPCVSXCopy : public MachineFunctionPass { + struct PPCVSXWACCCopy : public MachineFunctionPass { static char ID; - PPCVSXCopy() : MachineFunctionPass(ID) {} + PPCVSXWACCCopy() : MachineFunctionPass(ID) {} const TargetInstrInfo *TII; @@ -122,6 +122,33 @@ namespace { // Transform the original copy into a subregister extraction copy. SrcMO.setReg(NewVReg); SrcMO.setSubReg(PPC::sub_64); + } else if (IsRegInClass(DstMO.getReg(), &PPC::WACC_HIRCRegClass, MRI) && + IsRegInClass(SrcMO.getReg(), &PPC::WACCRCRegClass, MRI)) { + // Matches the pattern: + // %a:waccrc = COPY %b.sub_wacc_hi:dmrrc + // %c:wacc_hirc = COPY %a:waccrc + // And replaces it with: + // %c:wacc_hirc = COPY %b.sub_wacc_hi:dmrrc + MachineInstr *DefMI = MRI.getUniqueVRegDef(SrcMO.getReg()); + if (!DefMI || !DefMI->isCopy()) + continue; + + MachineOperand &OrigSrc = DefMI->getOperand(1); + + if (!IsRegInClass(OrigSrc.getReg(), &PPC::DMRRCRegClass, MRI)) + continue; + + if (OrigSrc.getSubReg() != PPC::sub_wacc_hi) + continue; + + // Rewrite the second copy to use the original register's subreg + SrcMO.setReg(OrigSrc.getReg()); + SrcMO.setSubReg(PPC::sub_wacc_hi); + Changed = true; + + // Remove the intermediate copy if safe + if (MRI.use_nodbg_empty(DefMI->getOperand(0).getReg())) + DefMI->eraseFromParent(); } } @@ -151,9 +178,9 @@ namespace { }; } // end anonymous namespace -INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE, +INITIALIZE_PASS(PPCVSXWACCCopy, DEBUG_TYPE, "PowerPC VSX Copy Legalization", false, false) -char PPCVSXCopy::ID = 0; +char PPCVSXWACCCopy::ID = 0; FunctionPass* -llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); } +llvm::createPPCVSXWACCCopyPass() { return new PPCVSXWACCCopy(); } diff --git a/llvm/test/CodeGen/PowerPC/dmr-copy.ll b/llvm/test/CodeGen/PowerPC/dmr-copy.ll new file mode 100644 index 0000000000000..d5a24309f94d5 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/dmr-copy.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE + +define void @test_wacc_copy(ptr noundef %vdmrp, ptr noundef %vpp, <16 x i8> noundef %vc, ptr noundef %resp) #0 { +; CHECK-LABEL: test_wacc_copy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: std r31, -8(r1) +; CHECK-NEXT: std r30, -16(r1) +; CHECK-NEXT: mr r30, r1 +; CHECK-NEXT: clrldi r0, r1, 57 +; CHECK-NEXT: subfic r0, r0, -384 +; CHECK-NEXT: stdux r1, r1, r0 +; CHECK-NEXT: .cfi_def_cfa_register r30 +; CHECK-NEXT: .cfi_offset r31, -8 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: mr r31, r1 +; CHECK-NEXT: std r3, 360(r31) +; CHECK-NEXT: std r4, 352(r31) +; CHECK-NEXT: stxv v2, 336(r31) +; CHECK-NEXT: std r7, 328(r31) +; CHECK-NEXT: ld r3, 360(r31) +; CHECK-NEXT: lxvp vsp34, 0(r3) +; CHECK-NEXT: lxvp vsp36, 32(r3) +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: lxvp vsp34, 64(r3) +; CHECK-NEXT: lxvp vsp36, 96(r3) +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-NEXT: stxvp vsp34, 224(r31) +; CHECK-NEXT: stxvp vsp36, 192(r31) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-NEXT: stxvp vsp34, 160(r31) +; CHECK-NEXT: stxvp vsp36, 128(r31) +; CHECK-NEXT: ld r3, 352(r31) +; CHECK-NEXT: lxv v2, 16(r3) +; CHECK-NEXT: lxv v3, 0(r3) +; CHECK-NEXT: stxv v2, 112(r31) +; CHECK-NEXT: stxv v3, 96(r31) +; CHECK-NEXT: lxv v2, 112(r31) +; CHECK-NEXT: lxv v3, 96(r31) +; CHECK-NEXT: lxv vs0, 336(r31) +; CHECK-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0 +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-NEXT: stxvp vsp34, 224(r31) +; CHECK-NEXT: stxvp vsp36, 192(r31) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-NEXT: stxvp vsp34, 160(r31) +; CHECK-NEXT: stxvp vsp36, 128(r31) +; CHECK-NEXT: lxvp vsp34, 128(r31) +; CHECK-NEXT: lxvp vsp36, 160(r31) +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: lxvp vsp34, 192(r31) +; CHECK-NEXT: lxvp vsp36, 224(r31) +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: ld r3, 328(r31) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-NEXT: stxvp vsp34, 96(r3) +; CHECK-NEXT: stxvp vsp36, 64(r3) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-NEXT: stxvp vsp34, 32(r3) +; CHECK-NEXT: stxvp vsp36, 0(r3) +; CHECK-NEXT: mr r1, r30 +; CHECK-NEXT: ld r31, -8(r1) +; CHECK-NEXT: ld r30, -16(r1) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_wacc_copy: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: std r31, -8(r1) +; CHECK-BE-NEXT: std r30, -16(r1) +; CHECK-BE-NEXT: mr r30, r1 +; CHECK-BE-NEXT: clrldi r0, r1, 57 +; CHECK-BE-NEXT: subfic r0, r0, -384 +; CHECK-BE-NEXT: stdux r1, r1, r0 +; CHECK-BE-NEXT: mr r31, r1 +; CHECK-BE-NEXT: std r3, 360(r31) +; CHECK-BE-NEXT: std r4, 352(r31) +; CHECK-BE-NEXT: stxv v2, 336(r31) +; CHECK-BE-NEXT: std r5, 328(r31) +; CHECK-BE-NEXT: ld r3, 360(r31) +; CHECK-BE-NEXT: lxvp vsp34, 96(r3) +; CHECK-BE-NEXT: lxvp vsp36, 64(r3) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: lxvp vsp34, 32(r3) +; CHECK-BE-NEXT: lxvp vsp36, 0(r3) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-BE-NEXT: stxvp vsp36, 224(r31) +; CHECK-BE-NEXT: stxvp vsp34, 192(r31) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-NEXT: stxvp vsp36, 160(r31) +; CHECK-BE-NEXT: stxvp vsp34, 128(r31) +; CHECK-BE-NEXT: ld r3, 352(r31) +; CHECK-BE-NEXT: lxv v2, 0(r3) +; CHECK-BE-NEXT: lxv v3, 16(r3) +; CHECK-BE-NEXT: stxv v3, 112(r31) +; CHECK-BE-NEXT: stxv v2, 96(r31) +; CHECK-BE-NEXT: lxv v2, 96(r31) +; CHECK-BE-NEXT: lxv v3, 112(r31) +; CHECK-BE-NEXT: lxv vs0, 336(r31) +; CHECK-BE-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0 +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-BE-NEXT: stxvp vsp36, 224(r31) +; CHECK-BE-NEXT: stxvp vsp34, 192(r31) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-NEXT: stxvp vsp36, 160(r31) +; CHECK-BE-NEXT: stxvp vsp34, 128(r31) +; CHECK-BE-NEXT: lxvp vsp34, 224(r31) +; CHECK-BE-NEXT: lxvp vsp36, 192(r31) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: lxvp vsp34, 160(r31) +; CHECK-BE-NEXT: lxvp vsp36, 128(r31) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: ld r3, 328(r31) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-BE-NEXT: stxvp vsp36, 96(r3) +; CHECK-BE-NEXT: stxvp vsp34, 64(r3) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-NEXT: stxvp vsp36, 32(r3) +; CHECK-BE-NEXT: stxvp vsp34, 0(r3) +; CHECK-BE-NEXT: mr r1, r30 +; CHECK-BE-NEXT: ld r31, -8(r1) +; CHECK-BE-NEXT: ld r30, -16(r1) +; CHECK-BE-NEXT: blr +entry: + %vdmrp.addr = alloca ptr, align 8 + %vpp.addr = alloca ptr, align 8 + %vc.addr = alloca <16 x i8>, align 16 + %resp.addr = alloca ptr, align 8 + %vdmr = alloca <1024 x i1>, align 128 + %vp = alloca <256 x i1>, align 32 + store ptr %vdmrp, ptr %vdmrp.addr, align 8 + store ptr %vpp, ptr %vpp.addr, align 8 + store <16 x i8> %vc, ptr %vc.addr, align 16 + store ptr %resp, ptr %resp.addr, align 8 + %0 = load ptr, ptr %vdmrp.addr, align 8 + %1 = load <1024 x i1>, ptr %0, align 128 + store <1024 x i1> %1, ptr %vdmr, align 128 + %2 = load ptr, ptr %vpp.addr, align 8 + %3 = load <256 x i1>, ptr %2, align 32 + store <256 x i1> %3, ptr %vp, align 32 + %4 = load <256 x i1>, ptr %vp, align 32 + %5 = load <16 x i8>, ptr %vc.addr, align 16 + %6 = call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> %4, <16 x i8> %5) + store <1024 x i1> %6, ptr %vdmr, align 128 + %7 = load <1024 x i1>, ptr %vdmr, align 128 + %8 = load ptr, ptr %resp.addr, align 8 + store <1024 x i1> %7, ptr %8, align 128 + ret void +} + +define void @foo(ptr noundef readonly captures(none) %p1, ptr noundef readonly captures(none) %p2, ptr noundef writeonly captures(none) initializes((0, 128)) %res1, ptr noundef writeonly captures(none) initializes((0, 128)) %res2) local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: dmsetdmrz dmr0 +; CHECK-NEXT: lxvp vsp34, 0(r3) +; CHECK-NEXT: lxvp vsp36, 32(r3) +; CHECK-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-NEXT: lxvp vsp34, 64(r3) +; CHECK-NEXT: lxvp vsp36, 96(r3) +; CHECK-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-NEXT: dmmr dmr2, dmr0 +; CHECK-NEXT: dmxor dmr2, dmr1 +; CHECK-NEXT: lxvp vsp34, 0(r4) +; CHECK-NEXT: lxvp vsp36, 32(r4) +; CHECK-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-NEXT: lxvp vsp34, 64(r4) +; CHECK-NEXT: lxvp vsp36, 96(r4) +; CHECK-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-NEXT: dmxor dmr0, dmr1 +; CHECK-NEXT: dmmr dmr1, dmr2 +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-NEXT: stxvp vsp34, 96(r5) +; CHECK-NEXT: stxvp vsp36, 64(r5) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi1, 1 +; CHECK-NEXT: stxvp vsp34, 32(r5) +; CHECK-NEXT: stxvp vsp36, 0(r5) +; CHECK-NEXT: dmmr dmr0, dmr0 +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-NEXT: stxvp vsp34, 96(r6) +; CHECK-NEXT: stxvp vsp36, 64(r6) +; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-NEXT: stxvp vsp34, 32(r6) +; CHECK-NEXT: stxvp vsp36, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: dmsetdmrz dmr0 +; CHECK-BE-NEXT: lxvp vsp34, 96(r3) +; CHECK-BE-NEXT: lxvp vsp36, 64(r3) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-BE-NEXT: lxvp vsp34, 32(r3) +; CHECK-BE-NEXT: lxvp vsp36, 0(r3) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmmr dmr2, dmr0 +; CHECK-BE-NEXT: dmxor dmr2, dmr1 +; CHECK-BE-NEXT: lxvp vsp34, 96(r4) +; CHECK-BE-NEXT: lxvp vsp36, 64(r4) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-BE-NEXT: lxvp vsp34, 32(r4) +; CHECK-BE-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxor dmr0, dmr1 +; CHECK-BE-NEXT: dmmr dmr1, dmr2 +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi1, 1 +; CHECK-BE-NEXT: stxvp vsp36, 96(r5) +; CHECK-BE-NEXT: stxvp vsp34, 64(r5) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-NEXT: stxvp vsp36, 32(r5) +; CHECK-BE-NEXT: stxvp vsp34, 0(r5) +; CHECK-BE-NEXT: dmmr dmr0, dmr0 +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 +; CHECK-BE-NEXT: stxvp vsp36, 96(r6) +; CHECK-BE-NEXT: stxvp vsp34, 64(r6) +; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-NEXT: stxvp vsp36, 32(r6) +; CHECK-BE-NEXT: stxvp vsp34, 0(r6) +; CHECK-BE-NEXT: blr +entry: + %0 = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz() + %1 = load <1024 x i1>, ptr %p1, align 128 + %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> %0, <1024 x i1> %1) + %3 = load <1024 x i1>, ptr %p2, align 128 + %4 = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> %0, <1024 x i1> %3) + %5 = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> %2) + store <1024 x i1> %5, ptr %res1, align 128 + %6 = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> %4) + store <1024 x i1> %6, ptr %res2, align 128 + ret void +} + +declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz() +declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>) +declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>) +declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1>, <16 x i8>) + +attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="future" "target-features"="+64bit,+allow-unaligned-fp-access,+altivec,+bpermd,+cmpb,+crbits,+crypto,+direct-move,+extdiv,+fast-MFLR,+fcpsgn,+fpcvt,+fprnd,+fpu,+fre,+fres,+frsqrte,+frsqrtes,+fsqrt,+fuse-add-logical,+fuse-arith-add,+fuse-logical,+fuse-logical-add,+fuse-sha3,+fuse-store,+fusion,+hard-float,+icbt,+isa-future-instructions,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+isa-v31-instructions,+isel,+ldbrx,+lfiwax,+mfocrf,+mma,+paired-vector-memops,+partword-atomics,+pcrelative-memops,+popcntd,+power10-vector,+power8-altivec,+power8-vector,+power9-altivec,+power9-vector,+ppc-postra-sched,+ppc-prera-sched,+predictable-select-expensive,+prefix-instrs,+quadword-atomics,+recipprec,+stfiwx,+two-const-nr,+vsx" } + + diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index ea3615cee392a..8ab54156a8af2 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -93,7 +93,7 @@ static_library("LLVMPowerPCCodeGen") { "PPCTargetMachine.cpp", "PPCTargetObjectFile.cpp", "PPCTargetTransformInfo.cpp", - "PPCVSXCopy.cpp", + "PPCVSXWACCCopy.cpp", "PPCVSXFMAMutate.cpp", "PPCVSXSwapRemoval.cpp", ]