Skip to content

Commit ea2546a

Browse files
committed
Apply pattern to basic case of 4 i32 loads into fpr128 register
1 parent e5f3c18 commit ea2546a

File tree

3 files changed

+139
-7
lines changed

3 files changed

+139
-7
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "Utils/AArch64BaseInfo.h"
2121
#include "llvm/ADT/ArrayRef.h"
2222
#include "llvm/ADT/STLExtras.h"
23+
#include "llvm/ADT/SmallSet.h"
2324
#include "llvm/ADT/SmallVector.h"
2425
#include "llvm/CodeGen/CFIInstBuilder.h"
2526
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
3536
#include "llvm/CodeGen/MachineRegisterInfo.h"
3637
#include "llvm/CodeGen/RegisterScavenging.h"
3738
#include "llvm/CodeGen/StackMaps.h"
39+
#include "llvm/CodeGen/TargetOpcodes.h"
3840
#include "llvm/CodeGen/TargetRegisterInfo.h"
3941
#include "llvm/CodeGen/TargetSubtargetInfo.h"
4042
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,11 +7319,63 @@ static bool getMiscPatterns(MachineInstr &Root,
73177319
return false;
73187320
}
73197321

7322+
/// Search for patterns where we use LD1i32 instructions to load into
7323+
/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
7324+
/// by loading into 2 Neon registers instead.
7325+
static bool getLoadPatterns(MachineInstr &Root,
7326+
SmallVectorImpl<unsigned> &Patterns) {
7327+
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
7328+
const TargetRegisterInfo *TRI =
7329+
Root.getMF()->getSubtarget().getRegisterInfo();
7330+
// Enable this only on Darwin targets, where it should be profitable. Other
7331+
// targets can remove this check if it is profitable there as well.
7332+
if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
7333+
return false;
7334+
7335+
// The pattern searches for loads into single lanes.
7336+
if (Root.getOpcode() != AArch64::LD1i32)
7337+
return false;
7338+
7339+
// The root of the pattern must load into the last lane of the vector.
7340+
if (Root.getOperand(2).getImm() != 3)
7341+
return false;
7342+
7343+
// Check that we have load into all lanes except lane 0.
7344+
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7345+
SmallSet<unsigned, 4> RemainingLanes({1, 2});
7346+
while (RemainingLanes.begin() != RemainingLanes.end() &&
7347+
Root.getOpcode() == AArch64::LD1i32 &&
7348+
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
7349+
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
7350+
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7351+
}
7352+
7353+
if (!RemainingLanes.empty())
7354+
return false;
7355+
7356+
// Match the SUBREG_TO_REG sequence.
7357+
if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
7358+
return false;
7359+
7360+
// Verify that the subreg to reg loads an i32 into the first lane.
7361+
auto Lane0Load = CurrInstr->getOperand(2).getReg();
7362+
if (TRI->getRegSizeInBits(Lane0Load, MRI) != 32)
7363+
return false;
7364+
7365+
// Verify that it also has a single non debug use.
7366+
if (!MRI.hasOneNonDBGUse(Lane0Load))
7367+
return false;
7368+
7369+
Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
7370+
return true;
7371+
}
7372+
73207373
CombinerObjective
73217374
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
73227375
switch (Pattern) {
73237376
case AArch64MachineCombinerPattern::SUBADD_OP1:
73247377
case AArch64MachineCombinerPattern::SUBADD_OP2:
7378+
case AArch64MachineCombinerPattern::SPLIT_LD:
73257379
return CombinerObjective::MustReduceDepth;
73267380
default:
73277381
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7351,6 +7405,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
73517405
if (getMiscPatterns(Root, Patterns))
73527406
return true;
73537407

7408+
// Load patterns
7409+
if (getLoadPatterns(Root, Patterns))
7410+
return true;
7411+
73547412
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
73557413
DoRegPressureReduce);
73567414
}
@@ -8681,6 +8739,76 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
86818739
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
86828740
break;
86838741
}
8742+
case AArch64MachineCombinerPattern::SPLIT_LD: {
  // Rewrite a chain of four single-lane LD1i32 loads (matched by
  // getLoadPatterns) into two independent two-lane load chains that are
  // recombined with a zip, increasing ILP.
  //
  // Gather the initial load instructions; we will use them later to build
  // the pattern. Root fills lane 3, fed by the lane-2 load, the lane-1
  // load, and a SUBREG_TO_REG inserting the scalar lane-0 load.
  MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
  MachineInstr *Lane1Load =
      MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
  MachineInstr *SubregToReg =
      MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
  MachineInstr *Lane0Load =
      MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
  const TargetRegisterClass *FPR128RegClass =
      MRI.getRegClass(Root.getOperand(0).getReg());

  // Helper: materialize a fresh IMPLICIT_DEF of the 128-bit class to serve
  // as the tied input of the first lane load into each half.
  auto CreateImplicitDef = [&]() {
    auto VirtReg = MRI.createVirtualRegister(FPR128RegClass);
    auto DefInstr = BuildMI(MF, MIMetadata(Root),
                            TII->get(TargetOpcode::IMPLICIT_DEF), VirtReg);
    InstrIdxForVirtReg.insert(std::make_pair(VirtReg, InsInstrs.size()));
    InsInstrs.push_back(DefInstr);
    return VirtReg;
  };
  // Helper: emit a single-lane load (same opcode as Root) of `Lane` into a
  // new FPR128 vreg tied to `SrcRegister`, reading from `OffsetRegister`.
  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
                                Register SrcRegister, unsigned Lane,
                                Register OffsetRegister) {
    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
    MachineInstrBuilder LoadIndexIntoRegister =
        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
                NewRegister)
            .addReg(SrcRegister)
            .addImm(Lane)
            .addReg(OffsetRegister, getKillRegState(true));
    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
    InsInstrs.push_back(LoadIndexIntoRegister);
    return NewRegister;
  };

  // To rewrite the pattern, we first need to define new registers to
  // load our results into.
  Register ImplicitDefForReg0 = CreateImplicitDef();
  Register ImplicitDefForReg1 = CreateImplicitDef();

  // Load index 0 into register 0 lane 0.
  Register Index0LoadReg = LoadLaneToRegister(
      Lane0Load, ImplicitDefForReg0, 0, Lane0Load->getOperand(2).getReg());
  DelInstrs.push_back(Lane0Load);
  DelInstrs.push_back(SubregToReg);

  // Load index 1 into register 1 lane 0.
  Register Index1LoadReg = LoadLaneToRegister(
      Lane1Load, ImplicitDefForReg1, 0, Lane1Load->getOperand(3).getReg());
  DelInstrs.push_back(Lane1Load);

  // Load index 2 into register 0 lane 1.
  auto Index2LoadReg = LoadLaneToRegister(Lane2Load, Index0LoadReg, 1,
                                          Lane2Load->getOperand(3).getReg());
  DelInstrs.push_back(Lane2Load);

  // Load index 3 into register 1 lane 1.
  auto Index3LoadReg = LoadLaneToRegister(&Root, Index1LoadReg, 1,
                                          Root.getOperand(3).getReg());
  // Root will be deleted after this pattern is applied.

  // Recombine the two halves. Register 0 holds elements {0, 2} in its 32-bit
  // lanes {0, 1} and register 1 holds elements {1, 3}, so the interleave must
  // be at 32-bit granularity: ZIP1v4i32 gives Vd[2i] = Vn[i], Vd[2i+1] =
  // Vm[i], i.e. [e0, e1, e2, e3]. (ZIP1v2i64 would concatenate the low 64-bit
  // halves and produce the wrong lane order [e0, e2, e1, e3].)
  MachineInstrBuilder ZipInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v4i32),
              Root.getOperand(0).getReg())
          .addReg(Index2LoadReg)
          .addReg(Index3LoadReg);
  InsInstrs.push_back(ZipInstr);
  break;
}
86848812

86858813
} // end switch (Pattern)
86868814
// Record MUL and ADD/SUB for deletion

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
172172
FMULv8i16_indexed_OP2,
173173

174174
FNMADD,
175+
176+
SPLIT_LD,
175177
};
176178
class AArch64InstrInfo final : public AArch64GenInstrInfo {
177179
const AArch64RegisterInfo RI;

llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,19 @@ body: |
88
liveins: $x0, $x1, $x2, $x3, $x4
99
1010
; CHECK-LABEL: name: split_loads_to_fpr128
11-
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
11+
; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
1212
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
1313
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
1414
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
1515
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
16-
; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
17-
; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
18-
; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
19-
; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
20-
; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
21-
; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
16+
; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
17+
; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
18+
; CHECK-NEXT: [[LD0_0:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 0, killed [[COPY1]]
19+
; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 0, killed [[COPY2]]
20+
; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[LD0_0]], 1, killed [[COPY3]]
21+
; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1_0]], 1, killed [[COPY4]]
22+
; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
23+
; CHECK-NEXT: $q0 = COPY [[ZIP]]
2224
; CHECK-NEXT: RET_ReallyLR implicit $q0
2325
%0:gpr64common = COPY $x0
2426
%1:gpr64common = COPY $x1

0 commit comments

Comments
 (0)