[AArch64][Machine-Combiner] Split loads into lanes of neon vectors into multiple vectors when possible #142941
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Jonathan Cohen (jcohen-apple)

Changes

This change optimizes gather-like sequences, where we load values separately into lanes of a Neon vector. Since each load has a serial dependency on the previous one, when performing multiple i32 loads into a 128-bit vector it is more profitable on wide processors to load into separate vector registers and zip them. This shows a small speedup on internal workloads at Apple. (A short illustrative sketch follows the diff below.)

Full diff: https://github.com/llvm/llvm-project/pull/142941.diff

4 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 54e2a009b464d..6d3b39ae01c99 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -461,7 +461,7 @@ bool MachineCombiner::preservesResourceLen(
"Length\n");
return ResLenAfterCombine <=
- ResLenBeforeCombine + TII->getExtendResourceLenLimit();
+ ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index a229b71b4b6e7..99bfd7eb5755c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,11 +7319,63 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Search for patterns where we use LD1i32 instructions to load into
+/// 4 separate lanes of a 128-bit Neon register. We can increase ILP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+ const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+ const TargetRegisterInfo *TRI =
+ Root.getMF()->getSubtarget().getRegisterInfo();
+ // Enable this only on Darwin targets, where it should be profitable. Other
+ // targets can remove this check if it is profitable there as well.
+ if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+ return false;
+
+ // The pattern searches for loads into single lanes.
+ if (Root.getOpcode() != AArch64::LD1i32)
+ return false;
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != 3)
+ return false;
+
+ // Check that we have loads into all lanes except lane 0.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ SmallSet<unsigned, 4> RemainingLanes({1, 2});
+ while (!RemainingLanes.empty() && CurrInstr &&
+ CurrInstr->getOpcode() == AArch64::LD1i32 &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the subreg to reg loads an i32 into the first lane.
+ auto Lane0Load = CurrInstr->getOperand(2).getReg();
+ if (TRI->getRegSizeInBits(Lane0Load, MRI) != 32)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0Load))
+ return false;
+
+ Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+ return true;
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::SPLIT_LD:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7351,6 +7405,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8681,6 +8739,73 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::SPLIT_LD: {
+ // Gather the initial load instructions; we will use them later to build the pattern.
+ MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ MachineInstr *Lane1Load =
+ MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
+ MachineInstr *SubregToReg =
+ MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
+ MachineInstr *Lane0Load =
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
+ const TargetRegisterClass *FPR128RegClass = MRI.getRegClass(Root.getOperand(0).getReg());
+
+ // Some helpful lambdas to increase code reuse.
+ auto CreateImplicitDef = [&]() {
+ auto VirtReg = MRI.createVirtualRegister(FPR128RegClass);
+ auto DefInstr = BuildMI(MF, MIMetadata(Root), TII->get(TargetOpcode::IMPLICIT_DEF), VirtReg);
+ InstrIdxForVirtReg.insert(std::make_pair(VirtReg, InsInstrs.size()));
+ InsInstrs.push_back(DefInstr);
+ return VirtReg;
+ };
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+ // To rewrite the pattern, we first need to define new registers to
+ // load our results into.
+ Register ImplicitDefForReg0 = CreateImplicitDef();
+ Register ImplicitDefForReg1 = CreateImplicitDef();
+
+ // Load index 0 into register 0 lane 0.
+ Register Index0LoadReg = LoadLaneToRegister(
+ Lane0Load, ImplicitDefForReg0, 0, Lane0Load->getOperand(2).getReg());
+ DelInstrs.push_back(Lane0Load);
+ DelInstrs.push_back(SubregToReg);
+
+ // Load index 1 into register 1 lane 0.
+ Register Index1LoadReg = LoadLaneToRegister(
+ Lane1Load, ImplicitDefForReg1, 0, Lane1Load->getOperand(3).getReg());
+ DelInstrs.push_back(Lane1Load);
+
+ // Load index 2 into register 0 lane 1.
+ auto Index2LoadReg = LoadLaneToRegister(Lane2Load, Index0LoadReg, 1,
+ Lane2Load->getOperand(3).getReg());
+ DelInstrs.push_back(Lane2Load);
+
+ // Load index 3 into register 1 lane 1.
+ auto Index3LoadReg = LoadLaneToRegister(&Root, Index1LoadReg, 1,
+ Root.getOperand(3).getReg());
+ // Root will be deleted after this pattern is applied.
+
+ // Create the zip instruction.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(Index2LoadReg)
+ .addReg(Index3LoadReg);
+ InsInstrs.push_back(ZipInstr);
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 0ffaca9af4006..65c8e4fb46e93 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ SPLIT_LD,
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
new file mode 100644
index 0000000000000..f1dbeb396c5dd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+
+---
+name: split_loads_to_fpr128
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[LD0_0:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 0, killed [[COPY1]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 0, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[LD0_0]], 1, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1_0]], 1, killed [[COPY4]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+ %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, killed %2
+ %8:fpr128 = LD1i32 %7, 2, killed %3
+ %9:fpr128 = LD1i32 %8, 3, killed %4
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
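For illustration only (this sketch is not part of the PR, and the function name gather4 is invented): a minimal C/NEON example of the kind of gather-like code the pattern targets. Each vsetq_lane_u32 consumes the vector produced by the previous insert, so the four loads form one serial dependency chain; the combiner instead loads into two separate 128-bit registers, two lanes each, and joins them with a zip, shortening the critical path on wide cores.

#include <arm_neon.h>

// Hypothetical example; each lane insert depends on the previous one,
// producing the chained LD1 lane loads that the new pattern matches.
uint32x4_t gather4(const uint32_t *a, const uint32_t *b,
                   const uint32_t *c, const uint32_t *d) {
  uint32x4_t v = vdupq_n_u32(0); // start from a zero vector
  v = vsetq_lane_u32(*a, v, 0);  // lane 0
  v = vsetq_lane_u32(*b, v, 1);  // lane 1
  v = vsetq_lane_u32(*c, v, 2);  // lane 2
  v = vsetq_lane_u32(*d, v, 3);  // lane 3
  return v;
}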
✅ With the latest revision this PR passed the C/C++ code formatter.
Looks decent.
Should it bail out for minsize?
Should it support v8i8 / v4i16 too?
Added an early exit for minsize, good catch.
Opcode = AArch64::LDRHui;
break;
case 16:
Opcode = AArch64::LDRBui; |
For 8 and 16 lanes, it may be worth splitting into more than 2 separate chains of loads?
Like I wrote above to @davemgreen - it kicks in less often because of the additional use of load ports. I wanted to keep the pattern (relatively) simple, so decided not to do that right now. I can add it in a follow-up PR if you think it's a good idea.
Yeah sounds good as follow-up
- Early exit if optimizing for size
- Fix loop condition to check if CurrInstr is not null
- Use .empty() instead of begin() != end()
- Rename pattern enum
LGTM, thanks
@jcohen-apple Given that we have seemingly 2 separate issues from committing this (the machine verifier errors and the miscompile in #150004), we should consider reverting this to get the tree back into a working state. If it was just the verifier errors I'd expect a quick response on #149585 to be enough, but it doesn't look like we have a fix yet for the miscompile.
We did seem to forget about aliasing. Reverting back to a good point and recommitting a fixed version might be the best way forward.
…egs to multiple vectors (#142941)" Reverting due to reported miscompiles, will reland with a fix.
…egs to multiple vectors (llvm#142941)" (llvm#150505) Reverting due to reported miscompiles, will reland once it is fixed.