Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[EarlyCSE] Compare GEP instructions based on offset #65875

Merged
merged 9 commits into from
Sep 19, 2023
172 changes: 141 additions & 31 deletions llvm/lib/Transforms/Scalar/EarlyCSE.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ STATISTIC(NumCSE, "Number of instructions CSE'd");
STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");

DEBUG_COUNTER(CSECounter, "early-cse",
Expand Down Expand Up @@ -143,11 +144,11 @@ struct SimpleValue {
!CI->getFunction()->isPresplitCoroutine();
}
return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
isa<FreezeInst>(Inst);
}
};

Expand Down Expand Up @@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
IVI->getOperand(1),
hash_combine_range(IVI->idx_begin(), IVI->idx_end()));

assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
isa<FreezeInst>(Inst)) &&
assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
"Invalid/unknown instruction");

// Handle intrinsics with commutative operands.
Expand Down Expand Up @@ -548,11 +548,81 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
// currently executing, so conservatively return false if they are in
// different basic blocks.
if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
return false;
return false;

return LHSI->isIdenticalTo(RHSI);
}

//===----------------------------------------------------------------------===//
// GEPValue
//===----------------------------------------------------------------------===//

namespace {

struct GEPValue {
Instruction *Inst;
std::optional<int64_t> ConstantOffset;

GEPValue(Instruction *I) : Inst(I) {
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
}

GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
: Inst(I), ConstantOffset(ConstantOffset) {
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
}

bool isSentinel() const {
return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
}

static bool canHandle(Instruction *Inst) {
return isa<GetElementPtrInst>(Inst);
}
};

} // namespace

namespace llvm {

/// DenseMapInfo traits that allow GEPValue to be used as a hash-table key.
/// The empty and tombstone keys reuse the Instruction pointer sentinels.
template <> struct DenseMapInfo<GEPValue> {
  static inline GEPValue getEmptyKey() {
    return GEPValue(DenseMapInfo<Instruction *>::getEmptyKey());
  }

  static inline GEPValue getTombstoneKey() {
    return GEPValue(DenseMapInfo<Instruction *>::getTombstoneKey());
  }

  // Both are defined out of line, after the llvm namespace is closed.
  static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
  static unsigned getHashValue(const GEPValue &Val);
};

} // end namespace llvm

/// Hashes a GEPValue. When a constant offset is available the hash covers the
/// opcode, the pointer operand and that offset; otherwise it covers the
/// opcode and every value operand of the GEP.
unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
  auto *GEPInst = cast<GetElementPtrInst>(Val.Inst);

  if (!Val.ConstantOffset) {
    // No constant offset: fall back to hashing the full operand list.
    auto OperandsHash = hash_combine_range(GEPInst->value_op_begin(),
                                           GEPInst->value_op_end());
    return hash_combine(GEPInst->getOpcode(), OperandsHash);
  }

  return hash_combine(GEPInst->getOpcode(), GEPInst->getPointerOperand(),
                      *Val.ConstantOffset);
}

/// Equality for GEPValue keys. Sentinels compare by pointer identity. Real
/// GEPs must share a pointer operand; after that they are compared by
/// constant offset when both sides have one, and by
/// isIdenticalToWhenDefined otherwise.
bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
  // Sentinel keys do not point at real instructions, so never cast them.
  if (LHS.isSentinel() || RHS.isSentinel())
    return LHS.Inst == RHS.Inst;

  auto *LeftGEP = cast<GetElementPtrInst>(LHS.Inst);
  auto *RightGEP = cast<GetElementPtrInst>(RHS.Inst);
  if (LeftGEP->getPointerOperand() != RightGEP->getPointerOperand())
    return false;

  const bool BothOffsetsKnown =
      LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value();
  return BothOffsetsKnown ? *LHS.ConstantOffset == *RHS.ConstantOffset
                          : LeftGEP->isIdenticalToWhenDefined(RightGEP);
}

//===----------------------------------------------------------------------===//
// EarlyCSE implementation
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -647,6 +717,13 @@ class EarlyCSE {
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;

using GEPMapAllocatorTy =
RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<GEPValue, Value *>>;
using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
GEPMapAllocatorTy>;
GEPHTType AvailableGEPs;

/// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;

Expand All @@ -667,9 +744,11 @@ class EarlyCSE {
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
: Scope(AvailableValues), LoadScope(AvailableLoads),
InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
GEPHTType &AvailableGEPs)
: Scope(AvailableValues), LoadScope(AvailableLoads),
InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
GEPScope(AvailableGEPs) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;

Expand All @@ -678,6 +757,7 @@ class EarlyCSE {
LoadHTType::ScopeTy LoadScope;
InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
GEPHTType::ScopeTy GEPScope;
};

// Contains all the needed information to create a stack for doing a depth
Expand All @@ -688,13 +768,13 @@ class EarlyCSE {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
DomTreeNode::const_iterator child,
DomTreeNode::const_iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
EndIter(end),
Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
AvailableCalls)
{}
AvailableCalls, AvailableGEPs) {}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;

Expand Down Expand Up @@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
return Result;
}

/// Intersects the IR flags of \p To with those of \p From when \p To is an
/// instruction whose flags may need to be weakened. Does nothing when \p To
/// is not an instruction.
static void combineIRFlags(Instruction &From, Value *To) {
  auto *ToInst = dyn_cast<Instruction>(To);
  if (!ToInst)
    return;

  // If ToInst being poison triggers UB, there is no need to drop its flags.
  // Otherwise, only retain flags present on both ToInst and From.
  // TODO: Currently some fast-math flags are not treated as
  // poison-generating even though they should. Until this is fixed, always
  // retain only the flags present on both instructions for floating point
  // instructions.
  const bool MustIntersect =
      isa<FPMathOperator>(ToInst) ||
      (ToInst->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(ToInst));
  if (MustIntersect)
    ToInst->andIRFlags(&From);
}

bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
const ParseMemoryInst &Later) {
// Can we remove Earlier store because of Later store?
Expand Down Expand Up @@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
if (auto *I = dyn_cast<Instruction>(V)) {
// If I being poison triggers UB, there is no need to drop those
// flags. Otherwise, only retain flags present on both I and Inst.
// TODO: Currently some fast-math flags are not treated as
// poison-generating even though they should. Until this is fixed,
// always retain flags present on both I and Inst for floating point
// instructions.
if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
I->andIRFlags(&Inst);
}
combineIRFlags(Inst, V);
Inst.replaceAllUsesWith(V);
salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
Expand Down Expand Up @@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}

// Compare GEP instructions based on offset.
if (GEPValue::canHandle(&Inst)) {
auto *GEP = cast<GetElementPtrInst>(&Inst);
APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset)
? Offset.trySExtValue()
: std::nullopt);
if (Value *V = AvailableGEPs.lookup(GEPVal)) {
LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V
<< '\n');
combineIRFlags(Inst, V);
Inst.replaceAllUsesWith(V);
salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
Inst.eraseFromParent();
Changed = true;
++NumCSEGEP;
continue;
}

// Otherwise, just remember that we have this GEP.
AvailableGEPs.insert(GEPVal, &Inst);
continue;
}

// A release fence requires that all stores complete before it, but does
// not prevent the reordering of following loads 'before' the fence. As a
// result, we don't need to consider it as writing to memory and don't need
Expand Down Expand Up @@ -1675,7 +1785,7 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
CurrentGeneration, DT.getRootNode(),
AvailableGEPs, CurrentGeneration, DT.getRootNode(),
DT.getRootNode()->begin(), DT.getRootNode()->end()));

assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
Expand All @@ -1698,10 +1808,10 @@ bool EarlyCSE::run() {
} else if (NodeToProcess->childIter() != NodeToProcess->end()) {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
AvailableCalls, NodeToProcess->childGeneration(),
child, child->begin(), child->end()));
nodesToProcess.push_back(new StackNode(
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
AvailableGEPs, NodeToProcess->childGeneration(), child,
child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
Expand Down
44 changes: 44 additions & 0 deletions llvm/test/Transforms/EarlyCSE/gep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s

%T1 = type { i64, i64, i64 }

declare void @use_vec(<4 x ptr>);

; Builds a series of GEPs off %a with different source element types and
; indices. The CHECK lines below record which GEPs remain after early-cse.
define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
; CHECK-NEXT: [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
; CHECK-NEXT: [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
; CHECK-NEXT: [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
; CHECK-NEXT: [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
; CHECK-NEXT: [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V]])
; CHECK-NEXT: [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 0, i64 2, i64 1, i64 1>
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V2]])
; CHECK-NEXT: ret void
;
; Byte offset 8 from %a; this is the first GEP and it is kept in the expected output.
%s1a = getelementptr i8, ptr %a, i64 8
%s1av = load i64, ptr %s1a
; Same as %s1a except for the inbounds flag; absent from the expected output.
%s1b = getelementptr inbounds i8, ptr %a, i64 8
%s1bv = load i64, ptr %s1b
; Field 1 of %T1 lies 8 bytes past %a; absent from the expected output.
%s1c = getelementptr %T1, ptr %a, i64 0, i32 1
%s1cv = load i64, ptr %s1c
; Byte offset 7 matches no earlier GEP; kept in the expected output.
%n1d = getelementptr i8, ptr %a, i64 7
%n1dv = load i64, ptr %n1d
; One i64 element is 8 bytes; absent from the expected output.
%s1e = getelementptr i64, ptr %a, i64 1
%s1ev = load i64, ptr %s1e
; Two i32 elements are 8 bytes; absent from the expected output.
%s1f = getelementptr i32, ptr %a, i64 2
%s1fv = load i64, ptr %s1f
; One i32 element is 4 bytes, matching no earlier GEP; kept in the expected output.
%n1g = getelementptr i32, ptr %a, i64 1
%n1gv = load i64, ptr %n1g
; Non-constant index %i; kept in the expected output.
%n1h = getelementptr i8, ptr %a, i64 %i
%n1hv = load i64, ptr %n1h

; Vector-index GEPs: both are kept in the expected output.
%v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
call void @use_vec(<4 x ptr> %v)
%v2 = getelementptr i64, ptr %a, <4 x i64> <i64 0, i64 2, i64 1, i64 1>
call void @use_vec(<4 x ptr> %v2)
ret void
}
44 changes: 44 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -O3 -S | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }

; Phase-ordering test: per the CHECK lines, the -O3 pipeline is expected to
; reduce this function to a single store of %_0 to %a.
define void @foo(ptr %a, <32 x i8> %_0) #0 {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: start:
; CHECK-NEXT: store <32 x i8> [[_0]], ptr [[A]], align 1
; CHECK-NEXT: ret void
;
start:
%z = alloca %Zip, align 8
; Byte-offset GEPs into %z: the 32-byte vector is stored at offset 16 and the
; constant 32 at offset 56.
%sroa_1 = getelementptr i8, ptr %z, i64 16
store <32 x i8> %_0, ptr %sroa_1, align 8
%len_ = getelementptr i8, ptr %z, i64 56
store i64 32, ptr %len_, align 8
; Struct-index GEPs into the same alloca. With the datalayout above, %_2 is at
; byte offset 56 (16 + 32 + 8), the same address as %len_, and %_10 is at byte
; offset 16, the same address as %sroa_1.
%_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
%_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
%len = load i64, ptr %_2, align 8
%_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
br label %body

; Byte-by-byte copy loop from %_10 to %a; it exits when %idx_ equals %len.
body: ; preds = %body, %start
%_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
%idx = phi i64 [ %idx_, %body ], [ 0, %start ]
%_34i = getelementptr i8, ptr %_34, i64 1
%idx_ = add i64 %idx, 1
store i64 0, ptr %_1, align 8
%_24 = getelementptr i8, ptr %_10, i64 %idx
%_18 = load i8, ptr %_24, align 1
store i8 %_18, ptr %_34, align 1
%_6 = icmp eq i64 %len, %idx_
br i1 %_6, label %exit, label %body

exit: ; preds = %body
ret void
}

attributes #0 = { "target-cpu"="znver3" }