Skip to content

Commit

Permalink
[CHR] Add a threshold for the code duplication
Browse files Browse the repository at this point in the history
ControlHeightReduction (CHR) clones a code region to reduce the number of
branches on the hot code path. The number of clones is linear in the
depth of the region.

Currently CHR has no control over the resulting code size increase. We have
seen one function of ~9000 BBs get expanded to ~250000 BBs, a 25x
increase. This creates a big compile-time issue for the downstream
optimizations.

This patch adds a cap on the number of clones for a single region.

Differential Revision: https://reviews.llvm.org/D138333
  • Loading branch information
xur-llvm committed Nov 22, 2022
1 parent b816b52 commit 6327d26
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 16 deletions.
6 changes: 5 additions & 1 deletion llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -675,8 +675,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(InstCombinePass());
invokePeepholeEPCallbacks(FPM, Level);

// Don't add CHR pass for CSIRInstr build in PostLink as the profile
// is still the same as the PreLink compilation.
if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
(PGOOpt->Action == PGOOptions::IRUse ||
((PGOOpt->Action == PGOOptions::IRUse &&
(Phase != ThinOrFullLTOPhase::ThinLTOPostLink ||
PGOOpt->CSAction != PGOOptions::CSIRInstr)) ||
PGOOpt->Action == PGOOptions::SampleUse))
FPM.addPass(ControlHeightReductionPass());

Expand Down
66 changes: 51 additions & 15 deletions llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ using namespace llvm;

#define CHR_DEBUG(X) LLVM_DEBUG(X)

// Global kill switch for the pass: when set, shouldApply() returns false for
// every function. It is checked before -force-chr, so it wins if both are set.
static cl::opt<bool> DisableCHR("disable-chr", cl::init(false), cl::Hidden,
cl::desc("Disable CHR for all functions"));

static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
cl::desc("Apply CHR for all functions"));

Expand All @@ -66,6 +69,10 @@ static cl::opt<std::string> CHRFunctionList(
"chr-function-list", cl::init(""), cl::Hidden,
cl::desc("Specify file to retrieve the list of functions to apply CHR to"));

// Cap on how many times a single region may be cloned by CHR (default 3).
// transformScopes() compares each region's accumulated clone count against
// this value and gives up on the scope once the count reaches it.
// NOTE(review): the identifier is misspelled ("Threshsold"); renaming it
// requires updating every use in this file at the same time.
static cl::opt<unsigned> CHRDupThreshsold(
"chr-dup-threshold", cl::init(3), cl::Hidden,
cl::desc("Max number of duplications by CHR for a region"));

static StringSet<> CHRModules;
static StringSet<> CHRFunctions;

Expand Down Expand Up @@ -339,23 +346,27 @@ class CHR {
BasicBlock *EntryBlock,
BasicBlock *NewEntryBlock,
ValueToValueMapTy &VMap);
void fixupBranchesAndSelects(CHRScope *Scope,
BasicBlock *PreEntryBlock,
BranchInst *MergedBR,
uint64_t ProfileCount);
void fixupBranch(Region *R,
CHRScope *Scope,
IRBuilder<> &IRB,
void fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock,
BranchInst *MergedBR, uint64_t ProfileCount);
void fixupBranch(Region *R, CHRScope *Scope, IRBuilder<> &IRB,
Value *&MergedCondition, BranchProbability &CHRBranchBias);
void fixupSelect(SelectInst* SI,
CHRScope *Scope,
IRBuilder<> &IRB,
void fixupSelect(SelectInst *SI, CHRScope *Scope, IRBuilder<> &IRB,
Value *&MergedCondition, BranchProbability &CHRBranchBias);
void addToMergedCondition(bool IsTrueBiased, Value *Cond,
Instruction *BranchOrSelect,
CHRScope *Scope,
IRBuilder<> &IRB,
Value *&MergedCondition);
Instruction *BranchOrSelect, CHRScope *Scope,
IRBuilder<> &IRB, Value *&MergedCondition);
/// Returns the total number of times region \p R has been cloned so far.
///
/// Cloning an ancestor region also clones \p R, but only the regions of the
/// scope being transformed get their own DuplicationCount entry bumped, so
/// the effective count for \p R is the sum of the counts recorded for \p R
/// and for every one of its ancestors.
unsigned getRegionDuplicationCount(const Region *R) {
  unsigned Count = 0;
  while (R) {
    // lookup() yields 0 for regions that were never cloned and, unlike
    // operator[], does not insert a map entry for each queried region.
    Count += DuplicationCount.lookup(R);
    R = R->getParent();
  }
  return Count;
}

Function &F;
BlockFrequencyInfo &BFI;
Expand All @@ -379,6 +390,8 @@ class CHR {
DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
// All the scopes.
DenseSet<CHRScope *> Scopes;
// Records how many times each region has been cloned.
DenseMap<const Region *, unsigned> DuplicationCount;
};

} // end anonymous namespace
Expand All @@ -396,7 +409,10 @@ raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
return OS;
}

static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) {
if (DisableCHR)
return false;

if (ForceCHR)
return true;

Expand Down Expand Up @@ -1666,6 +1682,26 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");

assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");

// Give up on this scope if any of its regions has already been cloned
// (directly, or through one of its ancestors) as many times as the
// duplication threshold allows.
for (RegInfo &RI : Scope->RegInfos) {
  const Region *R = RI.R;
  unsigned Duplication = getRegionDuplicationCount(R);
  // Debug-only trace: keep it inside CHR_DEBUG so that it does not print
  // during regular (non -debug) compilations.
  CHR_DEBUG(dbgs() << "Dup count for R=" << R << " is " << Duplication
                   << "\n");
  if (Duplication >= CHRDupThreshsold) {
    CHR_DEBUG(dbgs() << "Reached the dup threshold of " << Duplication
                     << " for this region\n");
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "DupThresholdReached",
                                      R->getEntry()->getTerminator())
             << "Reached the duplication threshold for the region";
    });
    return;
  }
}
// The scope will be transformed: record one more clone for each of its
// regions.
for (RegInfo &RI : Scope->RegInfos)
  ++DuplicationCount[RI.R];

Region *FirstRegion = Scope->RegInfos[0].R;
BasicBlock *EntryBlock = FirstRegion->getEntry();
Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
Expand Down
195 changes: 195 additions & 0 deletions llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
; Test case for capping the cloning in CHR.
; RUN: opt < %s -passes='require<profile-summary>,function(chr)' -chr-dup-threshold=2 -S | FileCheck %s

; C source for the test case:
; extern void foo(int);
; __attribute__((noinline)) void goo(int r, int s, int t) {
; if ((r & 2) != 0) {
; if ((s & 2) != 0) {
; if ((t & 2) != 0) {
; foo(111);
; }
; if ((t & 4) != 0) {
; foo(112);
; }
; }
; if ((s & 4) != 0) {
; if ((t & 2) != 0) {
; foo(121);
; }
; if ((t & 4) != 0) {
; foo(122);
; }
; }
; }
; if ((r & 4) != 0) {
; if ((s & 2) != 0) {
; if ((t & 2) != 0) {
; foo(211);
; }
; if ((t & 4) != 0) {
; foo(212);
; }
; }
; if ((s & 4) != 0) {
; if ((t & 2) != 0) {
; foo(221);
; }
; if ((t & 4) != 0) {
; foo(222);
; }
; }
; }
; }
;
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define dso_local void @goo(i32 noundef %r, i32 noundef %s, i32 noundef %t) !prof !34 {
entry:
%and = and i32 %r, 2
%cmp.not = icmp eq i32 %and, 0
br i1 %cmp.not, label %if.end24, label %if.then, !prof !35

if.then:
%and1 = and i32 %s, 2
%cmp2.not = icmp eq i32 %and1, 0
br i1 %cmp2.not, label %if.end11, label %if.then3, !prof !35

if.then3:
%and4 = and i32 %t, 2
%cmp5.not = icmp eq i32 %and4, 0
br i1 %cmp5.not, label %if.end, label %if.then6, !prof !35

if.then6:
tail call void @foo(i32 noundef 111)
br label %if.end

if.end:
%and7 = and i32 %t, 4
%cmp8.not = icmp eq i32 %and7, 0
br i1 %cmp8.not, label %if.end11, label %if.then9, !prof !35

if.then9:
tail call void @foo(i32 noundef 112)
br label %if.end11

if.end11:
%and12 = and i32 %s, 4
%cmp13.not = icmp eq i32 %and12, 0
br i1 %cmp13.not, label %if.end24, label %if.then14, !prof !35

if.then14:
%and15 = and i32 %t, 2
%cmp16.not = icmp eq i32 %and15, 0
br i1 %cmp16.not, label %if.end18, label %if.then17, !prof !35

if.then17:
tail call void @foo(i32 noundef 121)
br label %if.end18

if.end18:
%and19 = and i32 %t, 4
%cmp20.not = icmp eq i32 %and19, 0
br i1 %cmp20.not, label %if.end24, label %if.then21, !prof !35

if.then21:
tail call void @foo(i32 noundef 122)
br label %if.end24

if.end24:
%and25 = and i32 %r, 4
%cmp26.not = icmp eq i32 %and25, 0
br i1 %cmp26.not, label %if.end52, label %if.then27, !prof !35

if.then27:
%and28 = and i32 %s, 2
%cmp29.not = icmp eq i32 %and28, 0
br i1 %cmp29.not, label %if.end39, label %if.then30, !prof !35

if.then30:
%and31 = and i32 %t, 2
%cmp32.not = icmp eq i32 %and31, 0
br i1 %cmp32.not, label %if.end34, label %if.then33, !prof !35

if.then33:
tail call void @foo(i32 noundef 211)
br label %if.end34

if.end34:
%and35 = and i32 %t, 4
%cmp36.not = icmp eq i32 %and35, 0
br i1 %cmp36.not, label %if.end39, label %if.then37, !prof !35

if.then37:
tail call void @foo(i32 noundef 212)
br label %if.end39

if.end39:
%and40 = and i32 %s, 4
%cmp41.not = icmp eq i32 %and40, 0
br i1 %cmp41.not, label %if.end52, label %if.then42, !prof !35

if.then42:
%and43 = and i32 %t, 2
%cmp44.not = icmp eq i32 %and43, 0
br i1 %cmp44.not, label %if.end46, label %if.then45, !prof !35

if.then45:
tail call void @foo(i32 noundef 221)
br label %if.end46

if.end46:
%and47 = and i32 %t, 4
%cmp48.not = icmp eq i32 %and47, 0
br i1 %cmp48.not, label %if.end52, label %if.then49, !prof !35

if.then49:
tail call void @foo(i32 noundef 222)
br label %if.end52

if.end52:
ret void
}

; CHECK-LABEL: goo
; CHECK-COUNT-3: {{.*}}.split:
; CHECK-NOT: {{.*}}.split:

declare void @foo(i32 noundef)

!llvm.module.flags = !{!4}

!4 = !{i32 1, !"ProfileSummary", !5}
!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
!6 = !{!"ProfileFormat", !"InstrProf"}
!7 = !{!"TotalCount", i64 2400001}
!8 = !{!"MaxCount", i64 800000}
!9 = !{!"MaxInternalCount", i64 100000}
!10 = !{!"MaxFunctionCount", i64 800000}
!11 = !{!"NumCounts", i64 19}
!12 = !{!"NumFunctions", i64 4}
!13 = !{!"IsPartialProfile", i64 0}
!14 = !{!"PartialProfileRatio", double 0.000000e+00}
!15 = !{!"DetailedSummary", !16}
!16 = !{!17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
!17 = !{i32 10000, i64 800000, i32 1}
!18 = !{i32 100000, i64 800000, i32 1}
!19 = !{i32 200000, i64 800000, i32 1}
!20 = !{i32 300000, i64 800000, i32 1}
!21 = !{i32 400000, i64 100000, i32 17}
!22 = !{i32 500000, i64 100000, i32 17}
!23 = !{i32 600000, i64 100000, i32 17}
!24 = !{i32 700000, i64 100000, i32 17}
!25 = !{i32 800000, i64 100000, i32 17}
!26 = !{i32 900000, i64 100000, i32 17}
!27 = !{i32 950000, i64 100000, i32 17}
!28 = !{i32 990000, i64 100000, i32 17}
!29 = !{i32 999000, i64 100000, i32 17}
!30 = !{i32 999900, i64 100000, i32 17}
!31 = !{i32 999990, i64 100000, i32 17}
!32 = !{i32 999999, i64 100000, i32 17}
!34 = !{!"function_entry_count", i64 100000}
!35 = !{!"branch_weights", i32 0, i32 100000}
!36 = !{!"function_entry_count", i64 1}
!37 = !{!"branch_weights", i32 100000, i32 1}

0 comments on commit 6327d26

Please sign in to comment.