Skip to content

Commit 5b627a0

Browse files
committed
Apply pattern to basic case of 4 i32 loads into fpr128 register
1 parent 36fc378 commit 5b627a0

File tree

3 files changed

+144
-7
lines changed

3 files changed

+144
-7
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "Utils/AArch64BaseInfo.h"
2121
#include "llvm/ADT/ArrayRef.h"
2222
#include "llvm/ADT/STLExtras.h"
23+
#include "llvm/ADT/SmallSet.h"
2324
#include "llvm/ADT/SmallVector.h"
2425
#include "llvm/CodeGen/CFIInstBuilder.h"
2526
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
3536
#include "llvm/CodeGen/MachineRegisterInfo.h"
3637
#include "llvm/CodeGen/RegisterScavenging.h"
3738
#include "llvm/CodeGen/StackMaps.h"
39+
#include "llvm/CodeGen/TargetOpcodes.h"
3840
#include "llvm/CodeGen/TargetRegisterInfo.h"
3941
#include "llvm/CodeGen/TargetSubtargetInfo.h"
4042
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,6 +7319,57 @@ static bool getMiscPatterns(MachineInstr &Root,
73177319
return false;
73187320
}
73197321

7322+
/// Search for patterns where we use LD1i32 instructions to load into
7323+
/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
7324+
/// by loading into 2 Neon registers instead.
7325+
static bool getLoadPatterns(MachineInstr &Root,
7326+
SmallVectorImpl<unsigned> &Patterns) {
7327+
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
7328+
const TargetRegisterInfo *TRI =
7329+
Root.getMF()->getSubtarget().getRegisterInfo();
7330+
// Enable this only on Darwin targets, where it should be profitable. Other
7331+
// targets can remove this check if it is profitable there as well.
7332+
if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
7333+
return false;
7334+
7335+
// The pattern searches for loads into single lanes.
7336+
if (Root.getOpcode() != AArch64::LD1i32)
7337+
return false;
7338+
7339+
// The root of the pattern must load into the last lane of the vector.
7340+
if (Root.getOperand(2).getImm() != 3)
7341+
return false;
7342+
7343+
// Check that we have load into all lanes except lane 0.
7344+
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7345+
SmallSet<unsigned, 4> RemainingLanes({1, 2});
7346+
while (RemainingLanes.begin() != RemainingLanes.end() &&
7347+
CurrInstr->getOpcode() == AArch64::LD1i32 &&
7348+
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
7349+
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
7350+
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7351+
}
7352+
7353+
if (!RemainingLanes.empty())
7354+
return false;
7355+
7356+
// Match the SUBREG_TO_REG sequence.
7357+
if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
7358+
return false;
7359+
7360+
// Verify that the subreg to reg loads an i32 into the first lane.
7361+
auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
7362+
if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
7363+
return false;
7364+
7365+
// Verify that it also has a single non debug use.
7366+
if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
7367+
return false;
7368+
7369+
Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
7370+
return true;
7371+
}
7372+
73207373
CombinerObjective
73217374
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
73227375
switch (Pattern) {
@@ -7351,6 +7404,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
73517404
if (getMiscPatterns(Root, Patterns))
73527405
return true;
73537406

7407+
// Load patterns
7408+
if (getLoadPatterns(Root, Patterns))
7409+
return true;
7410+
73547411
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
73557412
DoRegPressureReduce);
73567413
}
@@ -8681,6 +8738,66 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
86818738
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
86828739
break;
86838740
}
8741+
case AArch64MachineCombinerPattern::SPLIT_LD: {
  // Rewrite a serial chain that fills one 128-bit register lane by lane
  //   %v0 = SUBREG_TO_REG ...        ; element 0
  //   %v1 = LD1i32 %v0, <lane>, %a   ; elements 1 and 2, in either order
  //   %v2 = LD1i32 %v1, <lane>, %b
  //   %v3 = LD1i32 %v2, 3, %c        ; Root, element 3
  // into two independent two-element chains joined by ZIP1v2i64:
  //   register 0 = [element 0, element 1, -, -]
  //   register 1 = [element 2, element 3, -, -]
  //   %v3 = ZIP1v2i64 register 0, register 1
  // ZIP1 on 64-bit elements concatenates the low 64 bits of its two sources,
  // so register 0 must hold elements 0/1 and register 1 elements 2/3.
  //
  // NOTE(review): the new loads are placed at Root's position without their
  // memory operands and without checking for intervening stores - confirm
  // this is safe for the chains the matcher accepts.

  // Gather the initial load instructions, we will use them later to build the
  // pattern.
  MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
  MachineInstr *Lane1Load =
      MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
  MachineInstr *SubregToReg =
      MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
  // The two intermediate loads may appear in either order in the chain, so
  // identify them by the lane they write rather than by position.
  if (Lane1Load->getOperand(2).getImm() != 1)
    std::swap(Lane1Load, Lane2Load);
  const TargetRegisterClass *FPR128RegClass =
      MRI.getRegClass(Root.getOperand(0).getReg());

  // Emit a copy of OriginalInstr that loads from the same address operand
  // into lane `Lane` of SrcRegister, and return the freshly defined register.
  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
                                Register SrcRegister, unsigned Lane) {
    const MachineOperand &AddrOp = OriginalInstr->getOperand(3);
    Register NewRegister = MRI.createVirtualRegister(FPR128RegClass);
    // Carry over the original kill state instead of forcing a kill: the
    // address register may still be live after the original load.
    MachineInstrBuilder LoadIndexIntoRegister =
        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
                NewRegister)
            .addReg(SrcRegister)
            .addImm(Lane)
            .addReg(AddrOp.getReg(), getKillRegState(AddrOp.isKill()));
    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
    InsInstrs.push_back(LoadIndexIntoRegister);
    return NewRegister;
  };

  // To rewrite the pattern, we first need to define a new register to
  // load the upper two elements into.
  Register ImplicitDefForReg1 = MRI.createVirtualRegister(FPR128RegClass);
  auto DefInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(TargetOpcode::IMPLICIT_DEF),
              ImplicitDefForReg1);
  InstrIdxForVirtReg.insert(
      std::make_pair(ImplicitDefForReg1, InsInstrs.size()));
  InsInstrs.push_back(DefInstr);

  // Load index 1 into register 0 lane 1.
  Register Index1LoadReg =
      LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1);
  DelInstrs.push_back(Lane1Load);

  // Load index 2 into register 1 lane 0.
  Register Index2LoadReg =
      LoadLaneToRegister(Lane2Load, ImplicitDefForReg1, 0);
  DelInstrs.push_back(Lane2Load);

  // Load index 3 into register 1 lane 1.
  Register Index3LoadReg = LoadLaneToRegister(&Root, Index2LoadReg, 1);
  // Root will be deleted after this pattern is applied

  // Create the zip instruction: low 64 bits of register 0 followed by the
  // low 64 bits of register 1 gives elements 0, 1, 2, 3 in order.
  MachineInstrBuilder ZipInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
              Root.getOperand(0).getReg())
          .addReg(Index1LoadReg)
          .addReg(Index3LoadReg);
  InsInstrs.push_back(ZipInstr);
  break;
}
86848801

86858802
} // end switch (Pattern)
86868803
// Record MUL and ADD/SUB for deletion

llvm/lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
172172
FMULv8i16_indexed_OP2,
173173

174174
FNMADD,
175+
176+
SPLIT_LD,
175177
};
176178
class AArch64InstrInfo final : public AArch64GenInstrInfo {
177179
const AArch64RegisterInfo RI;

llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,19 @@ body: |
88
liveins: $x0, $x1, $x2, $x3, $x4
99
1010
; CHECK-LABEL: name: split_loads_to_fpr128
11-
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
11+
; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
1212
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
1313
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
1414
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
1515
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
16-
; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
17-
; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
18-
; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
19-
; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
20-
; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
21-
; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
16+
; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
17+
; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]]
18+
; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
19+
; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 0, killed [[COPY2]]
20+
; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]]
21+
; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1_0]], 1, killed [[COPY4]]
22+
; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
23+
; CHECK-NEXT: $q0 = COPY [[ZIP]]
2224
; CHECK-NEXT: RET_ReallyLR implicit $q0
2325
%0:gpr64common = COPY $x0
2426
%1:gpr64common = COPY $x1
@@ -32,3 +34,19 @@ body: |
3234
%9:fpr128 = LD1i32 %8, 3, killed %4
3335
$q0 = COPY %9
3436
RET_ReallyLR implicit $q0
37+
38+
---
39+
name: negative_pattern
40+
body: |
41+
bb.0.entry:
42+
liveins: $x0, $x1
43+
44+
; CHECK-LABEL: name: negative_pattern
45+
; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
46+
; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
47+
48+
%0:gpr64common = COPY $x0
49+
%1:fpr128 = LDRQui $x1, 0
50+
%2:fpr128 = LD1i32 %1, 3, %0
51+
$q0 = COPY %2
52+
RET_ReallyLR implicit $q0

0 commit comments

Comments
 (0)