20 | 20 | #include "Utils/AArch64BaseInfo.h"
21 | 21 | #include "llvm/ADT/ArrayRef.h"
22 | 22 | #include "llvm/ADT/STLExtras.h"
   | 23 | +#include "llvm/ADT/SmallSet.h"
23 | 24 | #include "llvm/ADT/SmallVector.h"
24 | 25 | #include "llvm/CodeGen/CFIInstBuilder.h"
25 | 26 | #include "llvm/CodeGen/LivePhysRegs.h"

35 | 36 | #include "llvm/CodeGen/MachineRegisterInfo.h"
36 | 37 | #include "llvm/CodeGen/RegisterScavenging.h"
37 | 38 | #include "llvm/CodeGen/StackMaps.h"
   | 39 | +#include "llvm/CodeGen/TargetOpcodes.h"
38 | 40 | #include "llvm/CodeGen/TargetRegisterInfo.h"
39 | 41 | #include "llvm/CodeGen/TargetSubtargetInfo.h"
40 | 42 | #include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,6 +7319,57 @@ static bool getMiscPatterns(MachineInstr &Root,
7317 | 7319 |   return false;
7318 | 7320 | }
7319 | 7321 |
| 7322 | +/// Search for patterns where we use LD1i32 instructions to load into
| 7323 | +/// 4 separate lanes of a 128-bit Neon register. We can increase ILP
| 7324 | +/// by loading into 2 Neon registers instead.
| 7325 | +static bool getLoadPatterns(MachineInstr &Root,
| 7326 | +                            SmallVectorImpl<unsigned> &Patterns) {
| 7327 | +  const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
| 7328 | +  const TargetRegisterInfo *TRI =
| 7329 | +      Root.getMF()->getSubtarget().getRegisterInfo();
| 7330 | +  // Enable this only on Darwin targets, where it should be profitable. Other
| 7331 | +  // targets can remove this check if it is profitable there as well.
| 7332 | +  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
| 7333 | +    return false;
| 7334 | +
| 7335 | +  // The pattern searches for loads into single lanes.
| 7336 | +  if (Root.getOpcode() != AArch64::LD1i32)
| 7337 | +    return false;
| 7338 | +
| 7339 | +  // The root of the pattern must load into the last lane of the vector.
| 7340 | +  if (Root.getOperand(2).getImm() != 3)
| 7341 | +    return false;
| 7342 | +
| 7343 | +  // Check that we load into all lanes except lane 0; lane 0 is matched below.
| 7344 | +  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
| 7345 | +  SmallSet<unsigned, 4> RemainingLanes({1, 2});
| 7346 | +  while (!RemainingLanes.empty() &&
| 7347 | +         CurrInstr->getOpcode() == AArch64::LD1i32 &&
| 7348 | +         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
| 7349 | +    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
| 7350 | +    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
| 7351 | +  }
| 7352 | +
| 7353 | +  if (!RemainingLanes.empty())
| 7354 | +    return false;
| 7355 | +
| 7356 | +  // Match the SUBREG_TO_REG sequence that fills lane 0.
| 7357 | +  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
| 7358 | +    return false;
| 7359 | +
| 7360 | +  // Verify that the SUBREG_TO_REG loads an i32 into the first lane.
| 7361 | +  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
| 7362 | +  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
| 7363 | +    return false;
| 7364 | +
| 7365 | +  // Verify that it also has a single non-debug use.
| 7366 | +  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
| 7367 | +    return false;
| 7368 | +
| 7369 | +  Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
| 7370 | +  return true;
| 7371 | +}
| 7372 | +
7320 | 7373 | CombinerObjective
7321 | 7374 | AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
7322 | 7375 |   switch (Pattern) {
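
As a source-level illustration of what the matcher above looks for, consider the following NEON intrinsics sketch. The names are invented for illustration, and whether lane 0 arrives as a plain scalar load wrapped in a SUBREG_TO_REG (rather than, say, a DUP) depends on instruction selection, so treat this as an approximation of the matched MIR, not as a test case from the patch:

    #include <arm_neon.h>

    // Four i32 values gathered from four unrelated addresses into one
    // 128-bit vector. Each vld1q_lane_u32 (LD1i32) reads the previous
    // vector value, so the four loads form a single serial dependency
    // chain of depth four.
    uint32x4_t gather4(const uint32_t *p0, const uint32_t *p1,
                       const uint32_t *p2, const uint32_t *p3) {
      uint32x4_t v = vmovq_n_u32(*p0); // lane 0: scalar load into vector
      v = vld1q_lane_u32(p1, v, 1);    // LD1i32 into lane 1
      v = vld1q_lane_u32(p2, v, 2);    // LD1i32 into lane 2
      v = vld1q_lane_u32(p3, v, 3);    // LD1i32 into lane 3: the Root
      return v;
    }

Breaking that serial chain is the point of the pattern: with two destination registers, two of the lane loads can be in flight at once.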
@@ -7351,6 +7404,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
7351 | 7404 |   if (getMiscPatterns(Root, Patterns))
7352 | 7405 |     return true;
7353 | 7406 |
     | 7407 | +  // Load patterns
     | 7408 | +  if (getLoadPatterns(Root, Patterns))
     | 7409 | +    return true;
     | 7410 | +
7354 | 7411 |   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
7355 | 7412 |                                                      DoRegPressureReduce);
7356 | 7413 | }
@@ -8681,6 +8738,66 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
8681 | 8738 |     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
8682 | 8739 |     break;
8683 | 8740 |   }
| 8741 | +  case AArch64MachineCombinerPattern::SPLIT_LD: {
| 8742 | +    // Gather the initial load instructions; we will use them later to build
| 8743 | +    // the pattern.
| 8744 | +    MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
| 8745 | +    MachineInstr *Lane1Load =
| 8746 | +        MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
| 8747 | +    MachineInstr *SubregToReg =
| 8748 | +        MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
| 8749 | +    const TargetRegisterClass *FPR128RegClass =
| 8750 | +        MRI.getRegClass(Root.getOperand(0).getReg());
| 8751 | +
| 8752 | +    auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
| 8753 | +                                  Register SrcRegister, unsigned Lane,
| 8754 | +                                  Register OffsetRegister) {
| 8755 | +      auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
| 8756 | +      MachineInstrBuilder LoadIndexIntoRegister =
| 8757 | +          BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
| 8758 | +                  NewRegister)
| 8759 | +              .addReg(SrcRegister)
| 8760 | +              .addImm(Lane)
| 8761 | +              .addReg(OffsetRegister, getKillRegState(true));
| 8762 | +      InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
| 8763 | +      InsInstrs.push_back(LoadIndexIntoRegister);
| 8764 | +      return NewRegister;
| 8765 | +    };
| 8766 | +
| 8767 | +    // To rewrite the pattern, we first need to define a new register to
| 8768 | +    // load our results into.
| 8769 | +    auto ImplicitDefForReg1 = MRI.createVirtualRegister(FPR128RegClass);
| 8770 | +    auto DefInstr =
| 8771 | +        BuildMI(MF, MIMetadata(Root), TII->get(TargetOpcode::IMPLICIT_DEF),
| 8772 | +                ImplicitDefForReg1);
| 8773 | +    InstrIdxForVirtReg.insert(
| 8774 | +        std::make_pair(ImplicitDefForReg1, InsInstrs.size()));
| 8775 | +    InsInstrs.push_back(DefInstr);
| 8776 | +
| 8777 | +    // Load index 1 into register 1 lane 0.
| 8778 | +    Register Index1LoadReg = LoadLaneToRegister(
| 8779 | +        Lane1Load, ImplicitDefForReg1, 0, Lane1Load->getOperand(3).getReg());
| 8780 | +    DelInstrs.push_back(Lane1Load);
| 8781 | +
| 8782 | +    // Load index 2 into register 0 lane 1.
| 8783 | +    auto Index2LoadReg =
| 8784 | +        LoadLaneToRegister(Lane2Load, SubregToReg->getOperand(0).getReg(), 1,
| 8785 | +                           Lane2Load->getOperand(3).getReg());
| 8786 | +    DelInstrs.push_back(Lane2Load);
| 8787 | +
| 8788 | +    // Load index 3 into register 1 lane 1.
| 8789 | +    auto Index3LoadReg = LoadLaneToRegister(&Root, Index1LoadReg, 1,
| 8790 | +                                            Root.getOperand(3).getReg());
| 8791 | +    // Root will be deleted after this pattern is applied.
| 8792 | +
| 8793 | +    // Zip the even lanes (register 0) with the odd lanes (register 1).
| 8794 | +    MachineInstrBuilder ZipInstr =
| 8795 | +        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v4i32),
| 8796 | +                Root.getOperand(0).getReg())
| 8797 | +            .addReg(Index2LoadReg)
| 8798 | +            .addReg(Index3LoadReg);
| 8799 | +    InsInstrs.push_back(ZipInstr);
| 8800 | +  }
8684 | 8801 |
8685 | 8802 |   } // end switch (Pattern)
8686 | 8803 |   // Record MUL and ADD/SUB for deletion
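
For illustration, the rewritten dataflow can be expressed with the same NEON intrinsics. This is a sketch of the shape the combiner produces (it actually rewrites MachineInstrs, not source), with invented names: the even lanes go through register 0, the odd lanes through register 1, and a single ZIP1.4S reassembles the vector in the original lane order:

    #include <arm_neon.h>

    uint32x4_t gather4_split(const uint32_t *p0, const uint32_t *p1,
                             const uint32_t *p2, const uint32_t *p3) {
      uint32x4_t even = vmovq_n_u32(*p0); // v0 -> register 0, lane 0
      uint32x4_t odd = vmovq_n_u32(*p1);  // v1 -> register 1, lane 0
      even = vld1q_lane_u32(p2, even, 1); // v2 -> register 0, lane 1
      odd = vld1q_lane_u32(p3, odd, 1);   // v3 -> register 1, lane 1
      // ZIP1.4S interleaves the low halves:
      // {even[0], odd[0], even[1], odd[1]} = {v0, v1, v2, v3}.
      return vzip1q_u32(even, odd);
    }

The depth of each load chain drops from four to two, at the cost of one extra vector register and the final zip.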