diff --git a/docs/design/coreclr/jit/ryujit-overview.md b/docs/design/coreclr/jit/ryujit-overview.md index cdb17002ee1974..5e63d38e98f664 100644 --- a/docs/design/coreclr/jit/ryujit-overview.md +++ b/docs/design/coreclr/jit/ryujit-overview.md @@ -222,6 +222,7 @@ The top-level function of interest is `Compiler::compCompile`. It invokes the fo | [Common Subexpression Elimination (CSE)](#cse) | Elimination of redundant subexressions based on value numbers. | | [Assertion Propagation](#assertion-propagation) | Utilizes value numbers to propagate and transform based on properties such as non-nullness. | | [Range analysis](#range-analysis) | Eliminate array index range checks based on value numbers and assertions | +| [Induction variable optimization](#iv-opts) | Optimize induction variables used inside natural loops based on scalar evolution analysis | | [VN-based dead store elimination](#vn-based-dead-store-elimination) | Eliminate stores that do not change the value of a local. | | [If conversion](#if-conversion) | Transform conditional definitions into `GT_SELECT` operators. | | [Rationalization](#rationalization) | Flowgraph order changes from `FGOrderTree` to `FGOrderLinear`. All `GT_COMMA` nodes are transformed. | @@ -347,6 +348,11 @@ reused. Utilizes value numbers to propagate and transform based on properties such as non-nullness. +### Induction variable optimization + +Performs scalar evolution analysis and utilized it to optimize induction variables inside loops. +Currently this entails IV widening which is done on x64 only. + ### Range analysis Optimize array index range checks based on value numbers and assertions. diff --git a/docs/design/coreclr/jit/ryujit-tutorial.md b/docs/design/coreclr/jit/ryujit-tutorial.md index 34466e45afbcdc..ec900ccc8cd937 100644 --- a/docs/design/coreclr/jit/ryujit-tutorial.md +++ b/docs/design/coreclr/jit/ryujit-tutorial.md @@ -447,6 +447,10 @@ This is the same diagram as before, but with additional links to indicate execut - Determine initial value for dependent phis - Eliminate checks where the range of the index is within the check range +### Induction Variable Optimization +- Perform scalar evolution analysis to describe values of IR nodes inside loops +- Perform IV widening on x64 to avoid unnecessary zero extensions for array/span indexing + ## RyuJIT Back-End ### Rationalization diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index ae08a27e4c00aa..6e114f0f04a119 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -94,7 +94,6 @@ set( JIT_SOURCES bitset.cpp block.cpp buildstring.cpp - layout.cpp codegencommon.cpp codegenlinear.cpp compiler.cpp @@ -123,14 +122,15 @@ set( JIT_SOURCES gentree.cpp gschecks.cpp hashbv.cpp - hwintrinsic.cpp + helperexpansion.cpp hostallocator.cpp + hwintrinsic.cpp ifconversion.cpp - helperexpansion.cpp - indirectcalltransformer.cpp - importercalls.cpp importer.cpp + importercalls.cpp importervectorization.cpp + indirectcalltransformer.cpp + inductionvariableopts.cpp inline.cpp inlinepolicy.cpp instr.cpp @@ -138,6 +138,7 @@ set( JIT_SOURCES jiteh.cpp jithashtable.cpp jitmetadata.cpp + layout.cpp lclmorph.cpp lclvars.cpp likelyclass.cpp @@ -152,7 +153,6 @@ set( JIT_SOURCES objectalloc.cpp optcse.cpp optimizebools.cpp - switchrecognition.cpp optimizer.cpp patchpoint.cpp phase.cpp @@ -165,6 +165,7 @@ set( JIT_SOURCES regalloc.cpp registerargconvention.cpp regset.cpp + scev.cpp scopeinfo.cpp sideeffects.cpp sm.cpp @@ -173,6 +174,7 @@ set( JIT_SOURCES ssabuilder.cpp 
ssarenamestate.cpp stacklevelsetter.cpp + switchrecognition.cpp treelifeupdater.cpp unwind.cpp utils.cpp @@ -359,6 +361,7 @@ set( JIT_HEADERS registerargconvention.h register.h regset.h + scev.h sideeffects.h simd.h simdashwintrinsic.h diff --git a/src/coreclr/jit/clrjit.natvis b/src/coreclr/jit/clrjit.natvis index 95dd3dc305689b..98c374bea8f33f 100644 --- a/src/coreclr/jit/clrjit.natvis +++ b/src/coreclr/jit/clrjit.natvis @@ -86,6 +86,11 @@ Documentation for VS debugger format specifiers: https://docs.microsoft.com/en-u {gtTreeID, d}: [{gtOper,en}, {gtType,en} V{((GenTreeLclFld*)this)->_gtLclNum,u}[+{((GenTreeLclFld*)this)->m_lclOffs,u}]] + + + [{Oper,en}, {Type,en}] + + LinearScan diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 9fded7a13ccb0c..60b1a316c114aa 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4893,6 +4893,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl bool doValueNum = true; bool doLoopHoisting = true; bool doCopyProp = true; + bool doOptimizeIVs = true; bool doBranchOpt = true; bool doCse = true; bool doAssertionProp = true; @@ -4905,6 +4906,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl doSsa = (JitConfig.JitDoSsa() != 0); doEarlyProp = doSsa && (JitConfig.JitDoEarlyProp() != 0); doValueNum = doSsa && (JitConfig.JitDoValueNumber() != 0); + doOptimizeIVs = doSsa && (JitConfig.JitDoOptimizeIVs() != 0); doLoopHoisting = doValueNum && (JitConfig.JitDoLoopHoisting() != 0); doCopyProp = doValueNum && (JitConfig.JitDoCopyProp() != 0); doBranchOpt = doValueNum && (JitConfig.JitDoRedundantBranchOpts() != 0); @@ -5005,6 +5007,13 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl DoPhase(this, PHASE_OPTIMIZE_INDEX_CHECKS, &Compiler::rangeCheckPhase); } + if (doOptimizeIVs) + { + // Simplify and optimize induction variables used in natural loops + // + DoPhase(this, PHASE_OPTIMIZE_INDUCTION_VARIABLES, &Compiler::optInductionVariables); + } + if (doVNBasedDeadStoreRemoval) { // Note: this invalidates SSA and value numbers on tree nodes. 
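The hunk above wires the new phase into compCompile: it is gated on SSA plus the JitDoOptimizeIVs knob and runs right after index-check optimization. As a purely illustrative sketch of the kind of loop the phase improves (hypothetical code, not part of this change):

// Hypothetical source-level example; 'i' is a 32-bit primary induction variable.
int Sum(int* arr, int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
    {
        // On x64 the address of arr[i] needs the 32-bit 'i' zero extended to
        // 64 bits on every iteration; after IV widening the loop indexes with
        // a 64-bit copy of 'i' and the per-iteration extension disappears.
        sum += arr[i];
    }
    return sum;
}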
@@ -9409,6 +9418,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma comment(linker, "/include:cLoops") #pragma comment(linker, "/include:cLoopsA") #pragma comment(linker, "/include:cLoop") +#pragma comment(linker, "/include:cScev") #pragma comment(linker, "/include:cTreeFlags") #pragma comment(linker, "/include:cVN") @@ -9434,6 +9444,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #pragma comment(linker, "/include:dCVarSet") #pragma comment(linker, "/include:dLoop") #pragma comment(linker, "/include:dLoops") +#pragma comment(linker, "/include:dScev") #pragma comment(linker, "/include:dTreeFlags") #pragma comment(linker, "/include:dVN") @@ -9677,24 +9688,38 @@ JITDBGAPI void __cdecl cCVarSet(Compiler* comp, VARSET_VALARG_TP vars) JITDBGAPI void __cdecl cLoops(Compiler* comp) { static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called - printf("===================================================================== *NewLoops %u\n", sequenceNumber++); + printf("===================================================================== *Loops %u\n", sequenceNumber++); FlowGraphNaturalLoops::Dump(comp->m_loops); } JITDBGAPI void __cdecl cLoopsA(Compiler* comp, FlowGraphNaturalLoops* loops) { static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called - printf("===================================================================== *NewLoopsA %u\n", sequenceNumber++); + printf("===================================================================== *LoopsA %u\n", sequenceNumber++); FlowGraphNaturalLoops::Dump(loops); } JITDBGAPI void __cdecl cLoop(Compiler* comp, FlowGraphNaturalLoop* loop) { static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called - printf("===================================================================== *NewLoop %u\n", sequenceNumber++); + printf("===================================================================== *Loop %u\n", sequenceNumber++); FlowGraphNaturalLoop::Dump(loop); } +JITDBGAPI void __cdecl cScev(Compiler* comp, Scev* scev) +{ + static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called + printf("===================================================================== *Scev %u\n", sequenceNumber++); + if (scev == nullptr) + { + printf(" NULL\n"); + } + else + { + scev->Dump(comp); + } +} + JITDBGAPI void __cdecl cTreeFlags(Compiler* comp, GenTree* tree) { static unsigned sequenceNumber = 0; // separate calls with a number to indicate this function has been called @@ -10285,6 +10310,11 @@ JITDBGAPI void __cdecl dLoop(FlowGraphNaturalLoop* loop) cLoop(JitTls::GetCompiler(), loop); } +JITDBGAPI void __cdecl dScev(Scev* scev) +{ + cScev(JitTls::GetCompiler(), scev); +} + JITDBGAPI void __cdecl dTreeFlags(GenTree* tree) { cTreeFlags(JitTls::GetCompiler(), tree); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index cbe7fb95046a3d..f3712bebf21d09 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -42,6 +42,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "jitexpandarray.h" #include "tinyarray.h" #include "valuenum.h" +#include "scev.h" #include "namedintrinsiclist.h" #ifdef LATE_DISASM #include "disasm.h" @@ -4972,7 +4973,7 @@ class Compiler #ifdef DEBUG jitstd::vector* fgBBOrder; // ordered vector of BBs 
#endif - // Used as a quick check for whether loop alignment should look for natural loops. + // Used as a quick check for whether phases downstream of loop finding should look for natural loops. // If true: there may or may not be any natural loops in the flow graph, so try to find them // If false: there's definitely not any natural loops in the flow graph bool fgMightHaveNaturalLoops; @@ -7411,6 +7412,18 @@ class Compiler BasicBlock* basicBlock); #endif + PhaseStatus optInductionVariables(); + bool optCanSinkWidenedIV(unsigned lclNum, FlowGraphNaturalLoop* loop); + bool optIsIVWideningProfitable(unsigned lclNum, + BasicBlock* initBlock, + bool initedToConstant, + FlowGraphNaturalLoop* loop, + ArrayStack& ivUses); + void optBestEffortReplaceNarrowIVUses( + unsigned lclNum, unsigned ssaNum, unsigned newLclNum, BasicBlock* block, Statement* firstStmt); + void optReplaceWidenedIV(unsigned lclNum, unsigned ssaNum, unsigned newLclNum, Statement* stmt); + void optSinkWidenedIV(unsigned lclNum, unsigned newLclNum, FlowGraphNaturalLoop* loop); + // Redundant branch opts // PhaseStatus optRedundantBranches(); diff --git a/src/coreclr/jit/compmemkind.h b/src/coreclr/jit/compmemkind.h index 835d85f798d29b..e986682894c3b6 100644 --- a/src/coreclr/jit/compmemkind.h +++ b/src/coreclr/jit/compmemkind.h @@ -50,6 +50,7 @@ CompMemKindMacro(LoopOpt) CompMemKindMacro(LoopClone) CompMemKindMacro(LoopUnroll) CompMemKindMacro(LoopHoist) +CompMemKindMacro(LoopIVOpts) CompMemKindMacro(Unknown) CompMemKindMacro(RangeCheck) CompMemKindMacro(CopyProp) diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index 23930985319769..10b60167be4224 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -84,6 +84,7 @@ CompPhaseNameMacro(PHASE_BUILD_SSA_DF, "SSA: DF", CompPhaseNameMacro(PHASE_BUILD_SSA_INSERT_PHIS, "SSA: insert phis", false, PHASE_BUILD_SSA, false) CompPhaseNameMacro(PHASE_BUILD_SSA_RENAME, "SSA: rename", false, PHASE_BUILD_SSA, false) CompPhaseNameMacro(PHASE_EARLY_PROP, "Early Value Propagation", false, -1, false) +CompPhaseNameMacro(PHASE_OPTIMIZE_INDUCTION_VARIABLES, "Optimize Induction Variables", false, -1, false) CompPhaseNameMacro(PHASE_VALUE_NUMBER, "Do value numbering", false, -1, false) CompPhaseNameMacro(PHASE_OPTIMIZE_INDEX_CHECKS, "Optimize index checks", false, -1, false) CompPhaseNameMacro(PHASE_OPTIMIZE_VALNUM_CSES, "Optimize Valnum CSEs", false, -1, false) diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp new file mode 100644 index 00000000000000..d30202680976e0 --- /dev/null +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -0,0 +1,676 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// This file contains code to optimize induction variables in loops based on +// scalar evolution analysis (see scev.h and scev.cpp for more information +// about the scalar evolution analysis). +// +// Currently the only optimization done is widening of primary induction +// variables from 32 bits into 64 bits. This is generally only profitable on +// x64 that does not allow zero extension of 32-bit values in addressing modes +// (in contrast, arm64 does have the capability of including zero extensions in +// addressing modes). 
For x64 this saves a zero extension for every array +// access inside the loop, in exchange for some widening or narrowing stores +// outside the loop: +// - To make sure the new widened IV starts at the right value it is +// initialized to the value of the narrow IV outside the loop (either in the +// preheader or at the def location of the narrow IV). Usually the start +// value is a constant, in which case the widened IV is just initialized to +// the constant value. +// - If the narrow IV is used after the loop we need to store it back from +// the widened IV in the exits. We depend on liveness sets to figure out +// which exits to insert IR into. +// +// These steps ensure that the wide IV has the right value to begin with and +// the old narrow IV still has the right value after the loop. Additionally, +// we must replace every use of the narrow IV inside the loop with the widened +// IV. This is done by a traversal of the IR inside the loop. We do not +// actually widen the uses of the IV; rather, we keep all uses and defs as +// 32-bit, which the backend is able to handle efficiently on x64. Because of +// this we do not need to worry about overflow. +// + +#include "jitpch.h" +#include "scev.h" + +//------------------------------------------------------------------------ +// optCanSinkWidenedIV: Check to see if we are able to sink a store to the old +// local into the exits of a loop if we decide to widen. +// +// Parameters: +// lclNum - The primary induction variable +// loop - The loop +// +// Returns: +// True if we can sink a store to the old local after widening. +// +// Remarks: +// This handles the situation where the primary induction variable is used +// after the loop. In those cases we need to store the widened local back +// into the old one in the exits where the IV variable is live. +// +// We are able to sink when none of the exits are critical blocks, in the +// sense that all their predecessors must come from inside the loop. Loop +// exit canonicalization guarantees this for regular exit blocks. It is not +// guaranteed for exceptional exits, but we do not expect to widen IVs that +// are live into exceptional exits since those are marked DNER which makes it +// unprofitable anyway. +// +// Note that there may be natural loops that have not had their regular exits +// canonicalized at the time when IV opts run, in particular if RBO/assertion +// prop makes a previously unnatural loop natural. This function accounts for +// and rejects these cases. +// +bool Compiler::optCanSinkWidenedIV(unsigned lclNum, FlowGraphNaturalLoop* loop) +{ + LclVarDsc* dsc = lvaGetDesc(lclNum); + + BasicBlockVisit result = loop->VisitRegularExitBlocks([=](BasicBlock* exit) { + + if (!VarSetOps::IsMember(this, exit->bbLiveIn, dsc->lvVarIndex)) + { + JITDUMP(" Exit " FMT_BB " does not need a sink; V%02u is not live-in\n", exit->bbNum, lclNum); + return BasicBlockVisit::Continue; + } + + for (BasicBlock* pred : exit->PredBlocks()) + { + if (!loop->ContainsBlock(pred)) + { + JITDUMP(" Cannot safely sink widened version of V%02u into exit " FMT_BB " of " FMT_LP + "; it has a non-loop pred " FMT_BB "\n", + lclNum, exit->bbNum, loop->GetIndex(), pred->bbNum); + return BasicBlockVisit::Abort; + } + } + + return BasicBlockVisit::Continue; + }); + +#ifdef DEBUG + // We currently do not expect to ever widen IVs that are live into + // exceptional exits. Such IVs are expected to have been marked DNER + // previously (EH write-thru is only for single def locals) which makes it + // unprofitable. 
If this ever changes we need some more expansive handling + // here. + loop->VisitLoopBlocks([=](BasicBlock* block) { + + block->VisitAllSuccs(this, [=](BasicBlock* succ) { + if (!loop->ContainsBlock(succ) && bbIsHandlerBeg(succ)) + { + assert(!VarSetOps::IsMember(this, succ->bbLiveIn, dsc->lvVarIndex) && + "Candidate IV for widening is live into exceptional exit"); + } + + return BasicBlockVisit::Continue; + }); + + return BasicBlockVisit::Continue; + }); +#endif + + return result != BasicBlockVisit::Abort; +} + +//------------------------------------------------------------------------ +// optIsIVWideningProfitable: Check to see if IV widening is profitable. +// +// Parameters: +// lclNum - The primary induction variable +// initBlock - The block in where the new IV would be initialized +// initedToConstant - Whether or not the new IV will be initialized to a constant +// loop - The loop +// ivUses - Statements in which "lclNum" appears will be added to this list +// +// +// Returns: +// True if IV widening is profitable. +// +// Remarks: +// IV widening is generally profitable when it allows us to remove casts +// inside the loop. However, it may also introduce other reg-reg moves: +// 1. We may need to store the narrow IV into the wide one in the +// preheader. This is necessary when the start value is not constant. If +// the start value _is_ constant then we assume that the constant store to +// the narrow local will be a DCE'd. +// 2. We need to store the wide IV back into the narrow one in each of +// the exits where the narrow IV is live-in. +// +bool Compiler::optIsIVWideningProfitable(unsigned lclNum, + BasicBlock* initBlock, + bool initedToConstant, + FlowGraphNaturalLoop* loop, + ArrayStack& ivUses) +{ + for (FlowGraphNaturalLoop* otherLoop : m_loops->InReversePostOrder()) + { + if (otherLoop == loop) + continue; + + for (Statement* stmt : otherLoop->GetHeader()->Statements()) + { + if (!stmt->IsPhiDefnStmt()) + break; + + if (stmt->GetRootNode()->AsLclVarCommon()->GetLclNum() == lclNum) + { + JITDUMP(" V%02u has a phi [%06u] in " FMT_LP "'s header " FMT_BB "\n", lclNum, + dspTreeID(stmt->GetRootNode()), otherLoop->GetIndex(), otherLoop->GetHeader()->bbNum); + // TODO-CQ: We can legally widen these cases, but LSRA is + // unhappy about some of the lifetimes we create when we do + // this. This particularly affects cloned loops. + return false; + } + } + } + + const weight_t ExtensionCost = 2; + const int ExtensionSize = 3; + + weight_t savedCost = 0; + int savedSize = 0; + + loop->VisitLoopBlocks([&](BasicBlock* block) { + for (Statement* stmt : block->NonPhiStatements()) + { + bool hasUse = false; + int numExtensions = 0; + for (GenTree* node : stmt->TreeList()) + { + if (!node->OperIs(GT_CAST)) + { + hasUse |= node->OperIsLocal() && (node->AsLclVarCommon()->GetLclNum() == lclNum); + continue; + } + + GenTreeCast* cast = node->AsCast(); + if ((cast->gtCastType != TYP_LONG) || !cast->IsUnsigned() || cast->gtOverflow()) + { + continue; + } + + GenTree* op = cast->CastOp(); + if (!op->OperIs(GT_LCL_VAR) || (op->AsLclVarCommon()->GetLclNum() != lclNum)) + { + continue; + } + + // If this is already the source of a store then it is going to be + // free in our backends regardless. 
+ GenTree* parent = node->gtGetParent(nullptr); + if ((parent != nullptr) && parent->OperIs(GT_STORE_LCL_VAR)) + { + continue; + } + + numExtensions++; + } + + if (hasUse) + { + ivUses.Push(stmt); + } + + if (numExtensions > 0) + { + JITDUMP(" Found %d zero extensions in " FMT_STMT "\n", numExtensions, stmt->GetID()); + + savedSize += numExtensions * ExtensionSize; + savedCost += numExtensions * block->getBBWeight(this) * ExtensionCost; + } + } + + return BasicBlockVisit::Continue; + }); + + if (!initedToConstant) + { + // We will need to store the narrow IV into the wide one in the init + // block. We only cost this when init value is not a constant since + // otherwise we assume that constant initialization of the narrow local + // will be DCE'd. + savedSize -= ExtensionSize; + savedCost -= initBlock->getBBWeight(this) * ExtensionCost; + } + + // Now account for the cost of sinks. + LclVarDsc* dsc = lvaGetDesc(lclNum); + loop->VisitRegularExitBlocks([&](BasicBlock* exit) { + if (VarSetOps::IsMember(this, exit->bbLiveIn, dsc->lvVarIndex)) + { + savedSize -= ExtensionSize; + savedCost -= exit->getBBWeight(this) * ExtensionCost; + } + return BasicBlockVisit::Continue; + }); + + const weight_t ALLOWED_SIZE_REGRESSION_PER_CYCLE_IMPROVEMENT = 2; + weight_t cycleImprovementPerInvoc = savedCost / fgFirstBB->getBBWeight(this); + + JITDUMP(" Estimated cycle improvement: " FMT_WT " cycles per invocation\n", cycleImprovementPerInvoc); + JITDUMP(" Estimated size improvement: %d bytes\n", savedSize); + + if ((cycleImprovementPerInvoc > 0) && + ((cycleImprovementPerInvoc * ALLOWED_SIZE_REGRESSION_PER_CYCLE_IMPROVEMENT) >= -savedSize)) + { + JITDUMP(" Widening is profitable (cycle improvement)\n"); + return true; + } + + const weight_t ALLOWED_CYCLE_REGRESSION_PER_SIZE_IMPROVEMENT = 0.01; + + if ((savedSize > 0) && ((savedSize * ALLOWED_CYCLE_REGRESSION_PER_SIZE_IMPROVEMENT) >= -cycleImprovementPerInvoc)) + { + JITDUMP(" Widening is profitable (size improvement)\n"); + return true; + } + + JITDUMP(" Widening is not profitable\n"); + return false; +} + +//------------------------------------------------------------------------ +// optSinkWidenedIV: Create stores back to the narrow IV in the exits where +// that is necessary. +// +// Parameters: +// lclNum - Narrow version of primary induction variable +// newLclNum - Wide version of primary induction variable +// loop - The loop +// +// Returns: +// True if any store was created in any exit block. +// +void Compiler::optSinkWidenedIV(unsigned lclNum, unsigned newLclNum, FlowGraphNaturalLoop* loop) +{ + LclVarDsc* dsc = lvaGetDesc(lclNum); + loop->VisitRegularExitBlocks([=](BasicBlock* exit) { + if (!VarSetOps::IsMember(this, exit->bbLiveIn, dsc->lvVarIndex)) + { + return BasicBlockVisit::Continue; + } + + GenTree* narrowing = gtNewCastNode(TYP_INT, gtNewLclvNode(newLclNum, TYP_LONG), false, TYP_INT); + GenTree* store = gtNewStoreLclVarNode(lclNum, narrowing); + Statement* newStmt = fgNewStmtFromTree(store); + JITDUMP("Narrow IV local V%02u live into exit block " FMT_BB "; sinking a narrowing\n", lclNum, exit->bbNum); + DISPSTMT(newStmt); + fgInsertStmtAtBeg(exit, newStmt); + + return BasicBlockVisit::Continue; + }); +} + +//------------------------------------------------------------------------ +// optReplaceWidenedIV: Replace uses of the narrow IV with the wide IV in the +// specified statement. 
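//
// For illustration (a sketch with made-up local numbers; V02 is the narrow IV
// and V03 its widened copy), a zero-extending use such as
//
//   CAST(ulong <- uint)
//     \--- LCL_VAR int V02
//
// becomes a direct 64-bit use
//
//   LCL_VAR long V03
//
// while plain 32-bit uses of V02 turn into 32-bit uses of V03, and stores to
// V02 become TYP_LONG stores to V03 with the stored value cast up to long.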
+// +// Parameters: +// lclNum - Narrow version of primary induction variable +// newLclNum - Wide version of primary induction variable +// stmt - The statement to replace uses in. +// +void Compiler::optReplaceWidenedIV(unsigned lclNum, unsigned ssaNum, unsigned newLclNum, Statement* stmt) +{ + struct ReplaceVisitor : GenTreeVisitor + { + private: + unsigned m_lclNum; + unsigned m_ssaNum; + unsigned m_newLclNum; + + bool IsLocal(GenTreeLclVarCommon* tree) + { + return (tree->GetLclNum() == m_lclNum) && + ((m_ssaNum == SsaConfig::RESERVED_SSA_NUM) || (tree->GetSsaNum() == m_ssaNum)); + } + + public: + bool MadeChanges = false; + + enum + { + DoPreOrder = true, + }; + + ReplaceVisitor(Compiler* comp, unsigned lclNum, unsigned ssaNum, unsigned newLclNum) + : GenTreeVisitor(comp), m_lclNum(lclNum), m_ssaNum(ssaNum), m_newLclNum(newLclNum) + { + } + + fgWalkResult PreOrderVisit(GenTree** use, GenTree* user) + { + GenTree* node = *use; + if (node->OperIs(GT_CAST)) + { + GenTreeCast* cast = node->AsCast(); + if ((cast->gtCastType == TYP_LONG) && cast->IsUnsigned() && !cast->gtOverflow()) + { + GenTree* op = cast->CastOp(); + if (op->OperIs(GT_LCL_VAR) && IsLocal(op->AsLclVarCommon())) + { + *use = m_compiler->gtNewLclvNode(m_newLclNum, TYP_LONG); + MadeChanges = true; + return fgWalkResult::WALK_SKIP_SUBTREES; + } + } + } + else if (node->OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR, GT_LCL_FLD, GT_STORE_LCL_FLD) && + IsLocal(node->AsLclVarCommon())) + { + switch (node->OperGet()) + { + case GT_LCL_VAR: + node->AsLclVarCommon()->SetLclNum(m_newLclNum); + // No cast needed -- the backend allows TYP_INT uses of TYP_LONG locals. + break; + case GT_STORE_LCL_VAR: + { + node->AsLclVarCommon()->SetLclNum(m_newLclNum); + node->gtType = TYP_LONG; + node->AsLclVarCommon()->Data() = + m_compiler->gtNewCastNode(TYP_LONG, node->AsLclVarCommon()->Data(), true, TYP_LONG); + break; + } + case GT_LCL_FLD: + case GT_STORE_LCL_FLD: + assert(!"Unexpected field use for local not marked as DNER"); + break; + default: + break; + } + + MadeChanges = true; + } + + return fgWalkResult::WALK_CONTINUE; + } + }; + + ReplaceVisitor visitor(this, lclNum, ssaNum, newLclNum); + visitor.WalkTree(stmt->GetRootNodePointer(), nullptr); + if (visitor.MadeChanges) + { + gtSetStmtInfo(stmt); + fgSetStmtSeq(stmt); + JITDUMP("New tree:\n", dspTreeID(stmt->GetRootNode())); + DISPTREE(stmt->GetRootNode()); + JITDUMP("\n"); + } + else + { + JITDUMP("No replacements made\n"); + } +} + +//------------------------------------------------------------------------ +// optBestEffortReplaceNarrowIVUses: Try to find and replace uses of the specified +// SSA def with a new local. +// +// Parameters: +// lclNum - Previous local +// ssaNum - Previous local SSA num +// newLclNum - New local to replace with +// block - Block to replace in +// firstStmt - First statement in "block" to start replacing in +// +// Remarks: +// This function is best effort; it might not find all uses of the provided +// SSA num, particularly because it does not follow into joins. Note that we +// only use this to replace uses of the narrow IV outside the loop; inside +// the loop we do ensure that all uses/defs are replaced. +// Keeping it best-effort outside the loop is ok; there is no correctness +// issue since we do not invalidate the value of the old narrow IV in any +// way, but it may mean we end up leaving the narrow IV live concurrently +// with the new widened IV, increasing register pressure. 
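//
// A small illustrative CFG shape (block numbers are made up):
//
//   BB10: V02 = <start>        ; reaching def of the narrow IV ("initBlock");
//                              ; replacement starts at the statement after the
//                              ; newly inserted widened init
//   BB11: ... V02 ...          ; visited: BB10 is its unique predecessor
//   BB12: join of BB11, BB20   ; not visited: more than one predecessor, so
//                              ; the best-effort walk stops here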
+// +void Compiler::optBestEffortReplaceNarrowIVUses( + unsigned lclNum, unsigned ssaNum, unsigned newLclNum, BasicBlock* block, Statement* firstStmt) +{ + JITDUMP("Replacing V%02u -> V%02u in " FMT_BB " starting at " FMT_STMT "\n", lclNum, newLclNum, block->bbNum, + firstStmt == nullptr ? 0 : firstStmt->GetID()); + + for (Statement* stmt = firstStmt; stmt != nullptr; stmt = stmt->GetNextStmt()) + { + JITDUMP("Replacing V%02u -> V%02u in [%06u]\n", lclNum, newLclNum, dspTreeID(stmt->GetRootNode())); + DISPSTMT(stmt); + JITDUMP("\n"); + + optReplaceWidenedIV(lclNum, ssaNum, newLclNum, stmt); + } + + block->VisitRegularSuccs(this, [=](BasicBlock* succ) { + if (succ->GetUniquePred(this) == block) + { + optBestEffortReplaceNarrowIVUses(lclNum, ssaNum, newLclNum, succ, succ->firstStmt()); + } + + return BasicBlockVisit::Continue; + }); +} + +//------------------------------------------------------------------------ +// optInductionVariables: Try and optimize induction variables in the method. +// +// Returns: +// PhaseStatus indicating if anything changed. +// +PhaseStatus Compiler::optInductionVariables() +{ + JITDUMP("*************** In optInductionVariables()\n"); + +#ifdef DEBUG + static ConfigMethodRange s_range; + s_range.EnsureInit(JitConfig.JitEnableInductionVariableOptsRange()); + + if (!s_range.Contains(info.compMethodHash())) + { + return PhaseStatus::MODIFIED_NOTHING; + } +#endif + + if (!fgMightHaveNaturalLoops) + { + JITDUMP(" Skipping since this method has no natural loops\n"); + return PhaseStatus::MODIFIED_NOTHING; + } + + bool changed = false; + + // Currently we only do IV widening which generally is only profitable for + // x64 because arm64 addressing modes can include the zero/sign-extension + // of the index for free. + CLANG_FORMAT_COMMENT_ANCHOR; +#if defined(TARGET_XARCH) && defined(TARGET_64BIT) + m_dfsTree = fgComputeDfs(); + m_loops = FlowGraphNaturalLoops::Find(m_dfsTree); + + ScalarEvolutionContext scevContext(this); + JITDUMP("Widening primary induction variables:\n"); + ArrayStack ivUses(getAllocator(CMK_LoopIVOpts)); + for (FlowGraphNaturalLoop* loop : m_loops->InReversePostOrder()) + { + JITDUMP("Processing "); + DBEXEC(verbose, FlowGraphNaturalLoop::Dump(loop)); + scevContext.ResetForLoop(loop); + + for (Statement* stmt : loop->GetHeader()->Statements()) + { + if (!stmt->IsPhiDefnStmt()) + { + break; + } + + JITDUMP("\n"); + + DISPSTMT(stmt); + + GenTreeLclVarCommon* lcl = stmt->GetRootNode()->AsLclVarCommon(); + LclVarDsc* lclDsc = lvaGetDesc(lcl); + if (lclDsc->TypeGet() != TYP_INT) + { + JITDUMP(" Type is %s, no widening to be done\n", varTypeName(lclDsc->TypeGet())); + continue; + } + + // If the IV is not enregisterable then uses/defs are going to go + // to stack regardless. This check also filters out IVs that may be + // live into exceptional exits since those are always marked DNER. 
+ if (lclDsc->lvDoNotEnregister) + { + JITDUMP(" V%02u is marked DNER\n", lcl->GetLclNum()); + continue; + } + + Scev* scev = scevContext.Analyze(loop->GetHeader(), stmt->GetRootNode()); + if (scev == nullptr) + { + JITDUMP(" Could not analyze header PHI\n"); + continue; + } + + scev = scevContext.Simplify(scev); + JITDUMP(" => "); + DBEXEC(verbose, scev->Dump(this)); + JITDUMP("\n"); + if (!scev->OperIs(ScevOper::AddRec)) + { + JITDUMP(" Not an addrec\n"); + continue; + } + + ScevAddRec* addRec = (ScevAddRec*)scev; + + JITDUMP(" V%02u is a primary induction variable in " FMT_LP "\n", lcl->GetLclNum(), loop->GetIndex()); + + if (!optCanSinkWidenedIV(lcl->GetLclNum(), loop)) + { + continue; + } + + // Start value should always be an SSA use from outside the loop + // since we only widen primary IVs. + assert(addRec->Start->OperIs(ScevOper::Local)); + ScevLocal* startLocal = (ScevLocal*)addRec->Start; + int64_t startConstant = 0; + bool initToConstant = startLocal->GetConstantValue(this, &startConstant); + LclSsaVarDsc* startSsaDsc = lclDsc->GetPerSsaData(startLocal->SsaNum); + + BasicBlock* preheader = loop->EntryEdge(0)->getSourceBlock(); + BasicBlock* initBlock = preheader; + if ((startSsaDsc->GetBlock() != nullptr) && (startSsaDsc->GetDefNode() != nullptr)) + { + initBlock = startSsaDsc->GetBlock(); + } + + ivUses.Reset(); + if (!optIsIVWideningProfitable(lcl->GetLclNum(), initBlock, initToConstant, loop, ivUses)) + { + continue; + } + + changed = true; + + Statement* insertInitAfter = nullptr; + if (initBlock != preheader) + { + GenTree* narrowInitRoot = startSsaDsc->GetDefNode(); + while (true) + { + GenTree* parent = narrowInitRoot->gtGetParent(nullptr); + if (parent == nullptr) + break; + + narrowInitRoot = parent; + } + + for (Statement* stmt : initBlock->Statements()) + { + if (stmt->GetRootNode() == narrowInitRoot) + { + insertInitAfter = stmt; + break; + } + } + + assert(insertInitAfter != nullptr); + + if (insertInitAfter->IsPhiDefnStmt()) + { + while ((insertInitAfter->GetNextStmt() != nullptr) && + insertInitAfter->GetNextStmt()->IsPhiDefnStmt()) + { + insertInitAfter = insertInitAfter->GetNextStmt(); + } + } + } + + Statement* initStmt = nullptr; + unsigned newLclNum = lvaGrabTemp(false DEBUGARG(printfAlloc("Widened IV V%02u", lcl->GetLclNum()))); + INDEBUG(lclDsc = nullptr); + assert(startLocal->LclNum == lcl->GetLclNum()); + + if (initBlock != preheader) + { + JITDUMP("Adding initialization of new widened local to same block as reaching def outside loop, " FMT_BB + "\n", + initBlock->bbNum); + } + else + { + JITDUMP("Adding initialization of new widened local to preheader " FMT_BB "\n", initBlock->bbNum); + } + + GenTree* initVal; + if (initToConstant) + { + initVal = gtNewIconNode((int64_t)(uint32_t)startConstant, TYP_LONG); + } + else + { + initVal = gtNewCastNode(TYP_LONG, gtNewLclvNode(lcl->GetLclNum(), TYP_INT), true, TYP_LONG); + } + + GenTree* widenStore = gtNewTempStore(newLclNum, initVal); + initStmt = fgNewStmtFromTree(widenStore); + if (insertInitAfter != nullptr) + { + fgInsertStmtAfter(initBlock, insertInitAfter, initStmt); + } + else + { + fgInsertStmtNearEnd(initBlock, initStmt); + } + + DISPSTMT(initStmt); + JITDUMP("\n"); + + JITDUMP(" Replacing uses of V%02u with widened version V%02u\n", lcl->GetLclNum(), newLclNum); + + if (initStmt != nullptr) + { + JITDUMP(" Replacing on the way to the loop\n"); + optBestEffortReplaceNarrowIVUses(lcl->GetLclNum(), startLocal->SsaNum, newLclNum, initBlock, + initStmt->GetNextStmt()); + } + + JITDUMP(" Replacing in the 
loop; %d statements with appearances\n", ivUses.Height());
+            for (int i = 0; i < ivUses.Height(); i++)
+            {
+                Statement* stmt = ivUses.Bottom(i);
+                JITDUMP("Replacing V%02u -> V%02u in [%06u]\n", lcl->GetLclNum(), newLclNum,
+                        dspTreeID(stmt->GetRootNode()));
+                DISPSTMT(stmt);
+                JITDUMP("\n");
+                optReplaceWidenedIV(lcl->GetLclNum(), SsaConfig::RESERVED_SSA_NUM, newLclNum, stmt);
+            }
+
+            optSinkWidenedIV(lcl->GetLclNum(), newLclNum, loop);
+        }
+    }
+
+    fgInvalidateDfsTree();
+#endif
+
+    return changed ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
+}
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index abc510d967a80d..7c6e95bbb88a3e 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -480,8 +480,9 @@ CONFIG_INTEGER(JitNoRngChks, W("JitNoRngChks"), 0) // If 1, don't generate range
 #if defined(OPT_CONFIG)
 CONFIG_INTEGER(JitDoAssertionProp, W("JitDoAssertionProp"), 1) // Perform assertion propagation optimization
-CONFIG_INTEGER(JitDoCopyProp, W("JitDoCopyProp"), 1) // Perform copy propagation on variables that appear redundant
-CONFIG_INTEGER(JitDoEarlyProp, W("JitDoEarlyProp"), 1) // Perform Early Value Propagation
+CONFIG_INTEGER(JitDoCopyProp, W("JitDoCopyProp"), 1)     // Perform copy propagation on variables that appear redundant
+CONFIG_INTEGER(JitDoOptimizeIVs, W("JitDoOptimizeIVs"), 1) // Perform optimization of induction variables
+CONFIG_INTEGER(JitDoEarlyProp, W("JitDoEarlyProp"), 1)   // Perform Early Value Propagation
 CONFIG_INTEGER(JitDoLoopHoisting, W("JitDoLoopHoisting"), 1) // Perform loop hoisting on loop invariant values
 CONFIG_INTEGER(JitDoLoopInversion, W("JitDoLoopInversion"), 1) // Perform loop inversion on "for/while" loops
 CONFIG_INTEGER(JitDoRangeAnalysis, W("JitDoRangeAnalysis"), 1) // Perform range check analysis
@@ -496,6 +497,7 @@ CONFIG_STRING(JitOnlyOptimizeRange, W("JitOnlyOptimizeRange")) // If set, all methods that do _not_ match are forced into MinOpts
 CONFIG_STRING(JitEnablePhysicalPromotionRange, W("JitEnablePhysicalPromotionRange"))
 CONFIG_STRING(JitEnableCrossBlockLocalAssertionPropRange, W("JitEnableCrossBlockLocalAssertionPropRange"))
+CONFIG_STRING(JitEnableInductionVariableOptsRange, W("JitEnableInductionVariableOptsRange"))
 CONFIG_INTEGER(JitDoSsa, W("JitDoSsa"), 1) // Perform Static Single Assignment (SSA) numbering on the variables
 CONFIG_INTEGER(JitDoValueNumber, W("JitDoValueNumber"), 1) // Perform value numbering on method expressions
diff --git a/src/coreclr/jit/scev.cpp b/src/coreclr/jit/scev.cpp
new file mode 100644
index 00000000000000..81760593a8aba8
--- /dev/null
+++ b/src/coreclr/jit/scev.cpp
@@ -0,0 +1,821 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+// This file contains code to analyze how the values of induction variables
+// evolve (scalar evolution analysis), and to turn them into the SCEV IR
+// defined in scev.h. The analysis is inspired by "Michael Wolfe. 1992. Beyond
+// induction variables." and also by LLVM's scalar evolution analysis.
+//
+// The main idea of scalar evolution analysis is to give a closed form
+// describing the value of tree nodes inside loops even when taking into
+// account that they are changing on each loop iteration. This is useful for
+// optimizations that want to reason about values of IR nodes inside loops,
+// such as IV widening or strength reduction.
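//
// As a quick illustration (a made-up example, not lifted from the sources):
// in a loop like
//
//   int i = 0;
//   while (i < n)
//   {
//     sum += arr[i];
//     i++;
//   }
//
// the header phi for "i" is described by an add recurrence that starts at 0
// and steps by 1 on every iteration (written <L, 0, 1> in the notation
// introduced below), and a derived value such as "i * 4" simplifies to the
// add recurrence <L, 0, 4>.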
+//
+// To represent the possibility of evolution the SCEV IR includes the concept
+// of an add recurrence <loop, start, step>, which describes a value that
+// starts at "start" and changes by adding "step" at each iteration. The IR
+// nodes that change in this way (or depend on something that changes in this
+// way) are generally called induction variables.
+//
+// An add recurrence arises only when a local exists in the loop that is
+// mutated in each iteration. Such a local will naturally end up with a phi
+// node in the loop header. These locals are called primary (or basic)
+// induction variables. The non-primary IVs (which always must depend on the
+// primary IVs) are sometimes called secondary IVs.
+//
+// The job of the analysis is to go from a tree node to a SCEV node that
+// describes its value (possibly taking its evolution into account). Note that
+// SCEV nodes are immutable and the values they represent are _not_
+// flow-dependent; that is, they don't exist at a specific location inside the
+// loop, even though some particular tree node gave rise to that SCEV node. The
+// analysis itself _is_ flow-dependent and guarantees that the Scev* returned
+// describes the value that corresponds to what the tree node computes at its
+// specific location. However, it would be perfectly legal for two trees at
+// different locations in the loop to analyze to the same SCEV node (even
+// potentially returning the same pointer). For example, in theory "i" and "j"
+// in the following loop would both be represented by the same add recurrence
+// <L, 0, 1>, and the analysis could even return the same Scev* for both of
+// them, even if it does not today:
+//
+// int i = 0;
+// while (true)
+// {
+//   i++;
+//   ...
+//   int j = i - 1;
+// }
+//
+// Actually materializing the value of a SCEV node back into tree IR is not
+// implemented yet, but generally would depend on the availability of tree
+// nodes that compute the dependent values at the point where the IR is to be
+// materialized.
+//
+// Besides the add recurrences, the analysis itself is generally a
+// straightforward translation from JIT IR into the SCEV IR. Creating the add
+// recurrences requires paying attention to the structure of PHIs, and
+// disambiguating the values coming from outside the loop and the values coming
+// from the backedges. Currently only simplistic add recurrences that do not
+// require recursive analysis are supported. These simplistic add recurrences
+// are always of the form i = i + k.
+//
+
+#include "jitpch.h"
+
+//------------------------------------------------------------------------
+// GetConstantValue: If this SSA use refers to a constant, then fetch that
+// constant.
+//
+// Parameters:
+//   comp - Compiler instance
+//   cns  - [out] Constant value; only valid if this function returns true.
+//
+// Returns:
+//   True if this SSA use refers to a constant; otherwise false.
+//
+bool ScevLocal::GetConstantValue(Compiler* comp, int64_t* cns)
+{
+    LclVarDsc*           dsc     = comp->lvaGetDesc(LclNum);
+    LclSsaVarDsc*        ssaDsc  = dsc->GetPerSsaData(SsaNum);
+    GenTreeLclVarCommon* defNode = ssaDsc->GetDefNode();
+    if ((defNode != nullptr) && defNode->Data()->OperIs(GT_CNS_INT, GT_CNS_LNG))
+    {
+        *cns = defNode->Data()->AsIntConCommon()->IntegralValue();
+        return true;
+    }
+
+    return false;
+}
+
+//------------------------------------------------------------------------
+// Scev::GetConstantValue: If this SCEV is always a constant (i.e. either an
+// inline constant or an SSA use referring to a constant) then obtain that
+// constant.
+// +// Parameters: +// comp - Compiler instance +// cns - [out] Constant value; only valid if this function returns true. +// +// Returns: +// True if a constant could be extracted. +// +bool Scev::GetConstantValue(Compiler* comp, int64_t* cns) +{ + if (OperIs(ScevOper::Constant)) + { + *cns = ((ScevConstant*)this)->Value; + return true; + } + + if (OperIs(ScevOper::Local)) + { + return ((ScevLocal*)this)->GetConstantValue(comp, cns); + } + + return false; +} + +#ifdef DEBUG +//------------------------------------------------------------------------ +// Dump: Print this scev node to stdout. +// +// Parameters: +// comp - Compiler instance +// +void Scev::Dump(Compiler* comp) +{ + switch (Oper) + { + case ScevOper::Constant: + { + ScevConstant* cns = (ScevConstant*)this; + printf("%zd", (ssize_t)cns->Value); + break; + } + case ScevOper::Local: + { + ScevLocal* invariantLocal = (ScevLocal*)this; + printf("V%02u.%u", invariantLocal->LclNum, invariantLocal->SsaNum); + + int64_t cns; + if (invariantLocal->GetConstantValue(comp, &cns)) + { + printf(" (%lld)", (long long)cns); + } + break; + } + case ScevOper::ZeroExtend: + case ScevOper::SignExtend: + { + ScevUnop* unop = (ScevUnop*)this; + printf("%cext<%d>(", unop->Oper == ScevOper::ZeroExtend ? 'z' : 's', genTypeSize(unop->Type) * 8); + unop->Op1->Dump(comp); + printf(")"); + break; + } + case ScevOper::Add: + case ScevOper::Mul: + case ScevOper::Lsh: + { + ScevBinop* binop = (ScevBinop*)this; + printf("("); + binop->Op1->Dump(comp); + const char* op; + switch (binop->Oper) + { + case ScevOper::Add: + op = "+"; + break; + case ScevOper::Mul: + op = "*"; + break; + case ScevOper::Lsh: + op = "<<"; + break; + default: + unreached(); + } + printf(" %s ", op); + binop->Op2->Dump(comp); + printf(")"); + break; + } + case ScevOper::AddRec: + { + ScevAddRec* addRec = (ScevAddRec*)this; + printf("<" FMT_LP, addRec->Loop->GetIndex()); + printf(", "); + addRec->Start->Dump(comp); + printf(", "); + addRec->Step->Dump(comp); + printf(">"); + break; + } + default: + unreached(); + } +} +#endif + +//------------------------------------------------------------------------ +// ScalarEvolutionContext: Construct an instance of a context to do scalar evolution in. +// +// Parameters: +// comp - Compiler instance +// +// Remarks: +// After construction the context should be reset for a new loop by calling +// ResetForLoop. +// +ScalarEvolutionContext::ScalarEvolutionContext(Compiler* comp) + : m_comp(comp), m_cache(comp->getAllocator(CMK_LoopIVOpts)) +{ +} + +//------------------------------------------------------------------------ +// ResetForLoop: Reset the internal cache in preparation of scalar +// evolution analysis inside a new loop. +// +// Parameters: +// loop - The loop. +// +void ScalarEvolutionContext::ResetForLoop(FlowGraphNaturalLoop* loop) +{ + m_loop = loop; + m_cache.RemoveAll(); +} + +//------------------------------------------------------------------------ +// NewConstant: Create a SCEV node that represents a constant. +// +// Returns: +// The new node. +// +ScevConstant* ScalarEvolutionContext::NewConstant(var_types type, int64_t value) +{ + ScevConstant* constant = new (m_comp, CMK_LoopIVOpts) ScevConstant(type, value); + return constant; +} + +//------------------------------------------------------------------------ +// NewLocal: Create a SCEV node that represents an invariant local (i.e. a +// use of an SSA def from outside the loop). 
+// +// Parameters: +// lclNum - The local +// ssaNum - The SSA number of the def outside the loop that is being used. +// +// Returns: +// The new node. +// +ScevLocal* ScalarEvolutionContext::NewLocal(unsigned lclNum, unsigned ssaNum) +{ + var_types type = genActualType(m_comp->lvaGetDesc(lclNum)); + ScevLocal* invariantLocal = new (m_comp, CMK_LoopIVOpts) ScevLocal(type, lclNum, ssaNum); + return invariantLocal; +} + +//------------------------------------------------------------------------ +// NewExtension: Create a SCEV node that represents a zero or sign extension. +// +// Parameters: +// oper - The operation (ScevOper::ZeroExtend or ScevOper::SignExtend) +// targetType - The target type of the extension +// op - The operand being extended. +// +// Returns: +// The new node. +// +ScevUnop* ScalarEvolutionContext::NewExtension(ScevOper oper, var_types targetType, Scev* op) +{ + assert(op != nullptr); + ScevUnop* ext = new (m_comp, CMK_LoopIVOpts) ScevUnop(oper, targetType, op); + return ext; +} + +//------------------------------------------------------------------------ +// NewBinop: Create a SCEV node that represents a binary operation. +// +// Parameters: +// oper - The operation +// op1 - First operand +// op2 - Second operand +// +// Returns: +// The new node. +// +ScevBinop* ScalarEvolutionContext::NewBinop(ScevOper oper, Scev* op1, Scev* op2) +{ + assert((op1 != nullptr) && (op2 != nullptr)); + ScevBinop* binop = new (m_comp, CMK_LoopIVOpts) ScevBinop(oper, op1->Type, op1, op2); + return binop; +} + +//------------------------------------------------------------------------ +// NewAddRec: Create a SCEV node that represents a new add recurrence. +// +// Parameters: +// loop - The loop where this add recurrence is evolving +// start - Value of the recurrence at the first iteration +// step - Step value of the recurrence +// +// Returns: +// The new node. +// +ScevAddRec* ScalarEvolutionContext::NewAddRec(Scev* start, Scev* step) +{ + assert((start != nullptr) && (step != nullptr)); + ScevAddRec* addRec = new (m_comp, CMK_LoopIVOpts) ScevAddRec(start->Type, start, step DEBUGARG(m_loop)); + return addRec; +} + +//------------------------------------------------------------------------ +// CreateSimpleInvariantScev: Create a "simple invariant" SCEV node for a tree: +// either an invariant local use or a constant. +// +// Parameters: +// tree - The tree +// +// Returns: +// SCEV node or nullptr if the tree is not a simple invariant. +// +Scev* ScalarEvolutionContext::CreateSimpleInvariantScev(GenTree* tree) +{ + if (tree->OperIs(GT_CNS_INT, GT_CNS_LNG)) + { + return CreateScevForConstant(tree->AsIntConCommon()); + } + + if (tree->OperIs(GT_LCL_VAR) && tree->AsLclVarCommon()->HasSsaName()) + { + LclVarDsc* dsc = m_comp->lvaGetDesc(tree->AsLclVarCommon()); + LclSsaVarDsc* ssaDsc = dsc->GetPerSsaData(tree->AsLclVarCommon()->GetSsaNum()); + + if ((ssaDsc->GetBlock() == nullptr) || !m_loop->ContainsBlock(ssaDsc->GetBlock())) + { + return NewLocal(tree->AsLclVarCommon()->GetLclNum(), tree->AsLclVarCommon()->GetSsaNum()); + } + } + + return nullptr; +} + +//------------------------------------------------------------------------ +// CreateScevForConstant: Given an integer constant, create a SCEV node for it. +// +// Parameters: +// tree - The integer constant +// +// Returns: +// SCEV node or nullptr if the integer constant is not representable (e.g. a handle). 
+// +Scev* ScalarEvolutionContext::CreateScevForConstant(GenTreeIntConCommon* tree) +{ + if (tree->IsIconHandle() || !tree->TypeIs(TYP_INT, TYP_LONG)) + { + return nullptr; + } + + return NewConstant(tree->TypeGet(), tree->AsIntConCommon()->IntegralValue()); +} + +//------------------------------------------------------------------------ +// AnalyzeNew: Analyze the specified tree in the specified block, without going +// through the cache. +// +// Parameters: +// block - Block containing the tree +// tree - Tree node +// depth - Current analysis depth +// +// Returns: +// SCEV node if the tree was analyzable; otherwise nullptr if the value is +// cannot be described. +// +Scev* ScalarEvolutionContext::AnalyzeNew(BasicBlock* block, GenTree* tree, int depth) +{ + switch (tree->OperGet()) + { + case GT_CNS_INT: + case GT_CNS_LNG: + { + return CreateScevForConstant(tree->AsIntConCommon()); + } + case GT_LCL_VAR: + case GT_PHI_ARG: + { + if (!tree->AsLclVarCommon()->HasSsaName()) + { + return nullptr; + } + + assert(m_comp->lvaInSsa(tree->AsLclVarCommon()->GetLclNum())); + LclVarDsc* dsc = m_comp->lvaGetDesc(tree->AsLclVarCommon()); + LclSsaVarDsc* ssaDsc = dsc->GetPerSsaData(tree->AsLclVarCommon()->GetSsaNum()); + + if ((ssaDsc->GetBlock() == nullptr) || !m_loop->ContainsBlock(ssaDsc->GetBlock())) + { + return NewLocal(tree->AsLclVarCommon()->GetLclNum(), tree->AsLclVarCommon()->GetSsaNum()); + } + + if (ssaDsc->GetDefNode() == nullptr) + { + // GT_CALL retbuf def? + return nullptr; + } + + if (ssaDsc->GetDefNode()->GetLclNum() != tree->AsLclVarCommon()->GetLclNum()) + { + // Should be a def of the parent + assert(dsc->lvIsStructField && (ssaDsc->GetDefNode()->GetLclNum() == dsc->lvParentLcl)); + return nullptr; + } + + return Analyze(ssaDsc->GetBlock(), ssaDsc->GetDefNode(), depth + 1); + } + case GT_STORE_LCL_VAR: + { + GenTreeLclVarCommon* store = tree->AsLclVarCommon(); + GenTree* data = store->Data(); + if (!data->OperIs(GT_PHI)) + { + return Analyze(block, data, depth + 1); + } + + if (block != m_loop->GetHeader()) + { + return nullptr; + } + + // We have a phi def for the current loop. Look for a primary + // induction variable. + GenTreePhi* phi = data->AsPhi(); + GenTreePhiArg* enterSsa = nullptr; + GenTreePhiArg* backedgeSsa = nullptr; + + for (GenTreePhi::Use& use : phi->Uses()) + { + GenTreePhiArg* phiArg = use.GetNode()->AsPhiArg(); + GenTreePhiArg*& ssaArg = m_loop->ContainsBlock(phiArg->gtPredBB) ? backedgeSsa : enterSsa; + if ((ssaArg == nullptr) || (ssaArg->GetSsaNum() == phiArg->GetSsaNum())) + { + ssaArg = phiArg; + } + else + { + return nullptr; + } + } + + if ((enterSsa == nullptr) || (backedgeSsa == nullptr)) + { + return nullptr; + } + + ScevLocal* enterScev = NewLocal(enterSsa->GetLclNum(), enterSsa->GetSsaNum()); + + LclVarDsc* dsc = m_comp->lvaGetDesc(store); + LclSsaVarDsc* ssaDsc = dsc->GetPerSsaData(backedgeSsa->GetSsaNum()); + + if (ssaDsc->GetDefNode() == nullptr) + { + // GT_CALL retbuf def + return nullptr; + } + + if (ssaDsc->GetDefNode()->GetLclNum() != store->GetLclNum()) + { + assert(dsc->lvIsStructField && ssaDsc->GetDefNode()->GetLclNum() == dsc->lvParentLcl); + return nullptr; + } + + assert(ssaDsc->GetBlock() != nullptr); + + // We currently do not handle complicated addrecs. We can do this + // by inserting a symbolic node in the cache and analyzing while it + // is part of the cache. It would allow us to model + // + // int i = 0; + // while (i < n) + // { + // int j = i + 1; + // ... 
+ // i = j; + // } + // => + // + // and chains of recurrences, such as + // + // int i = 0; + // int j = 0; + // while (i < n) + // { + // j++; + // i += j; + // } + // => > + // + // The main issue is that it requires cache invalidation afterwards + // and turning the recursive result into an addrec. + // + return CreateSimpleAddRec(store, enterScev, ssaDsc->GetBlock(), ssaDsc->GetDefNode()->Data()); + } + case GT_CAST: + { + GenTreeCast* cast = tree->AsCast(); + if (cast->gtCastType != TYP_LONG) + { + return nullptr; + } + + Scev* op = Analyze(block, cast->CastOp(), depth + 1); + if (op == nullptr) + { + return nullptr; + } + + return NewExtension(cast->IsUnsigned() ? ScevOper::ZeroExtend : ScevOper::SignExtend, TYP_LONG, op); + } + case GT_ADD: + case GT_MUL: + case GT_LSH: + { + Scev* op1 = Analyze(block, tree->gtGetOp1(), depth + 1); + if (op1 == nullptr) + return nullptr; + + Scev* op2 = Analyze(block, tree->gtGetOp2(), depth + 1); + if (op2 == nullptr) + return nullptr; + + ScevOper oper; + switch (tree->OperGet()) + { + case GT_ADD: + oper = ScevOper::Add; + break; + case GT_MUL: + oper = ScevOper::Mul; + break; + case GT_LSH: + oper = ScevOper::Lsh; + break; + default: + unreached(); + } + + return NewBinop(oper, op1, op2); + } + case GT_COMMA: + { + return Analyze(block, tree->gtGetOp2(), depth + 1); + } + case GT_ARR_ADDR: + { + return Analyze(block, tree->AsArrAddr()->Addr(), depth + 1); + } + default: + return nullptr; + } +} + +//------------------------------------------------------------------------ +// CreateSimpleAddRec: Create a "simple" add-recurrence. This handles the most +// common patterns for primary induction variables where we see a store like +// "i = i + 1". +// +// Parameters: +// headerStore - Phi definition of the candidate primary induction variable +// enterScev - SCEV describing start value of the primary induction variable +// stepDefBlock - Block containing the def of the step value +// stepDefData - Value of the def of the step value +// +// Returns: +// SCEV node if this is a simple addrec shape. Otherwise nullptr. +// +Scev* ScalarEvolutionContext::CreateSimpleAddRec(GenTreeLclVarCommon* headerStore, + ScevLocal* enterScev, + BasicBlock* stepDefBlock, + GenTree* stepDefData) +{ + if (!stepDefData->OperIs(GT_ADD)) + { + return nullptr; + } + + GenTree* stepTree; + GenTree* op1 = stepDefData->gtGetOp1(); + GenTree* op2 = stepDefData->gtGetOp2(); + if (op1->OperIs(GT_LCL_VAR) && (op1->AsLclVar()->GetLclNum() == headerStore->GetLclNum()) && + (op1->AsLclVar()->GetSsaNum() == headerStore->GetSsaNum())) + { + stepTree = op2; + } + else if (op2->OperIs(GT_LCL_VAR) && (op2->AsLclVar()->GetLclNum() == headerStore->GetLclNum()) && + (op2->AsLclVar()->GetSsaNum() == headerStore->GetSsaNum())) + { + stepTree = op1; + } + else + { + // Not a simple IV shape (i.e. more complex than "i = i + k") + return nullptr; + } + + Scev* stepScev = CreateSimpleInvariantScev(stepTree); + if (stepScev == nullptr) + { + return nullptr; + } + + return NewAddRec(enterScev, stepScev); +} + +//------------------------------------------------------------------------ +// Analyze: Analyze the specified tree in the specified block. +// +// Parameters: +// block - Block containing the tree +// tree - Tree node +// +// Returns: +// SCEV node if the tree was analyzable; otherwise nullptr if the value is +// cannot be described. 
+// +Scev* ScalarEvolutionContext::Analyze(BasicBlock* block, GenTree* tree) +{ + return Analyze(block, tree, 0); +} + +// Since the analysis follows SSA defs we have no upper bound on the potential +// depth of the analysis performed. We put an artificial limit on this for two +// reasons: +// 1. The analysis is recursive, and we should not stack overflow regardless of +// the input program. +// 2. If we produced arbitrarily deep SCEV trees then all algorithms over their +// structure would similarly be at risk of stack overflows if they were +// recursive. However, these algorithms are generally much more elegant when +// they make use of recursion. +const int SCALAR_EVOLUTION_ANALYSIS_MAX_DEPTH = 64; + +//------------------------------------------------------------------------ +// Analyze: Analyze the specified tree in the specified block. +// +// Parameters: +// block - Block containing the tree +// tree - Tree node +// depth - Current analysis depth +// +// Returns: +// SCEV node if the tree was analyzable; otherwise nullptr if the value is +// cannot be described. +// +Scev* ScalarEvolutionContext::Analyze(BasicBlock* block, GenTree* tree, int depth) +{ + Scev* result; + if (!m_cache.Lookup(tree, &result)) + { + if (depth >= SCALAR_EVOLUTION_ANALYSIS_MAX_DEPTH) + { + return nullptr; + } + + result = AnalyzeNew(block, tree, depth); + m_cache.Set(tree, result); + } + + return result; +} + +//------------------------------------------------------------------------ +// FoldBinop: Fold simple binops. +// +// Type parameters: +// T - Type that the binop is being evaluated in +// +// Parameters: +// oper - Binary operation +// op1 - First operand +// op2 - Second operand +// +// Returns: +// Folded value. +// +template +static T FoldBinop(ScevOper oper, T op1, T op2) +{ + switch (oper) + { + case ScevOper::Add: + return op1 + op2; + case ScevOper::Mul: + return op1 * op2; + case ScevOper::Lsh: + return op1 << op2; + default: + unreached(); + } +} + +//------------------------------------------------------------------------ +// Simplify: Try to simplify a SCEV node by folding and canonicalization. +// +// Parameters: +// scev - The node +// +// Returns: +// Simplified node. +// +// Remarks: +// Canonicalization is done for binops; constants are moved to the right and +// addrecs are moved to the left. +// +// Simple unops/binops on constants are folded. Operands are distributed into +// add recs whenever possible. +// +Scev* ScalarEvolutionContext::Simplify(Scev* scev) +{ + switch (scev->Oper) + { + case ScevOper::Constant: + case ScevOper::Local: + { + return scev; + } + case ScevOper::ZeroExtend: + case ScevOper::SignExtend: + { + ScevUnop* unop = (ScevUnop*)scev; + assert(genTypeSize(unop->Type) >= genTypeSize(unop->Op1->Type)); + + Scev* op1 = Simplify(unop->Op1); + + if (unop->Type == op1->Type) + { + return op1; + } + + assert((unop->Type == TYP_LONG) && (op1->Type == TYP_INT)); + + if (op1->OperIs(ScevOper::Constant)) + { + ScevConstant* cns = (ScevConstant*)op1; + return NewConstant(unop->Type, unop->OperIs(ScevOper::ZeroExtend) ? (uint64_t)(int32_t)cns->Value + : (int64_t)(int32_t)cns->Value); + } + + if (op1->OperIs(ScevOper::AddRec)) + { + // TODO-Cleanup: This requires some proof that it is ok, but + // currently we do not rely on this. + return op1; + } + + return (op1 == unop->Op1) ? 
unop : NewExtension(unop->Oper, unop->Type, op1); + } + case ScevOper::Add: + case ScevOper::Mul: + case ScevOper::Lsh: + { + ScevBinop* binop = (ScevBinop*)scev; + Scev* op1 = Simplify(binop->Op1); + Scev* op2 = Simplify(binop->Op2); + + if (binop->OperIs(ScevOper::Add, ScevOper::Mul)) + { + // Normalize addrecs to the left + if (op2->OperIs(ScevOper::AddRec) && !op1->OperIs(ScevOper::AddRec)) + { + std::swap(op1, op2); + } + // Normalize constants to the right + if (op1->OperIs(ScevOper::Constant) && !op2->OperIs(ScevOper::Constant)) + { + std::swap(op1, op2); + } + } + + if (op1->OperIs(ScevOper::AddRec)) + { + // + x => + // * x => + ScevAddRec* addRec = (ScevAddRec*)op1; + Scev* newStart = Simplify(NewBinop(binop->Oper, addRec->Start, op2)); + Scev* newStep = scev->OperIs(ScevOper::Mul, ScevOper::Lsh) + ? Simplify(NewBinop(binop->Oper, addRec->Step, op2)) + : addRec->Step; + return NewAddRec(newStart, newStep); + } + + if (op1->OperIs(ScevOper::Constant) && op2->OperIs(ScevOper::Constant)) + { + ScevConstant* cns1 = (ScevConstant*)op1; + ScevConstant* cns2 = (ScevConstant*)op2; + int64_t newValue; + if (binop->TypeIs(TYP_INT)) + { + newValue = FoldBinop(binop->Oper, static_cast(cns1->Value), + static_cast(cns2->Value)); + } + else + { + assert(binop->TypeIs(TYP_LONG)); + newValue = FoldBinop(binop->Oper, cns1->Value, cns2->Value); + } + + return NewConstant(binop->Type, newValue); + } + + return (op1 == binop->Op1) && (op2 == binop->Op2) ? binop : NewBinop(binop->Oper, op1, op2); + } + case ScevOper::AddRec: + { + ScevAddRec* addRec = (ScevAddRec*)scev; + Scev* start = Simplify(addRec->Start); + Scev* step = Simplify(addRec->Step); + return (start == addRec->Start) && (step == addRec->Step) ? addRec : NewAddRec(start, step); + } + default: + unreached(); + } +} diff --git a/src/coreclr/jit/scev.h b/src/coreclr/jit/scev.h new file mode 100644 index 00000000000000..603088d9623661 --- /dev/null +++ b/src/coreclr/jit/scev.h @@ -0,0 +1,155 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#pragma once + +// This file contains the definition of the scalar evolution IR. This IR allows +// representing the values of IR nodes inside loops in a closed form, taking +// into account that they are changing on each loop iteration. The IR is based +// around the following possible operations. At the core is ScevOper::AddRec, +// which represents a value that evolves by an add recurrence. In dumps it is +// described by where "loop" is the loop the value is +// evolving in, "start" is the initial value and "step" is the step by which +// the value evolves in every iteration. +// +// See scev.cpp for further documentation. +// +enum class ScevOper +{ + Constant, + Local, + ZeroExtend, + SignExtend, + Add, + Mul, + Lsh, + AddRec, +}; + +static bool ScevOperIs(ScevOper oper, ScevOper otherOper) +{ + return oper == otherOper; +} + +template +static bool ScevOperIs(ScevOper oper, ScevOper operFirst, Args... operTail) +{ + return oper == operFirst || ScevOperIs(oper, operTail...); +} + +struct Scev +{ + const ScevOper Oper; + const var_types Type; + + Scev(ScevOper oper, var_types type) : Oper(oper), Type(type) + { + } + + template + bool OperIs(Args... 
opers) + { + return ScevOperIs(Oper, opers...); + } + + bool TypeIs(var_types type) + { + return Type == type; + } + + bool GetConstantValue(Compiler* comp, int64_t* cns); + +#ifdef DEBUG + void Dump(Compiler* comp); +#endif +}; + +struct ScevConstant : Scev +{ + ScevConstant(var_types type, int64_t value) : Scev(ScevOper::Constant, type), Value(value) + { + } + + int64_t Value; +}; + +struct ScevLocal : Scev +{ + ScevLocal(var_types type, unsigned lclNum, unsigned ssaNum) + : Scev(ScevOper::Local, type), LclNum(lclNum), SsaNum(ssaNum) + { + } + + const unsigned LclNum; + const unsigned SsaNum; + + bool GetConstantValue(Compiler* comp, int64_t* cns); +}; + +struct ScevUnop : Scev +{ + ScevUnop(ScevOper oper, var_types type, Scev* op1) : Scev(oper, type), Op1(op1) + { + } + + Scev* const Op1; +}; + +struct ScevBinop : ScevUnop +{ + ScevBinop(ScevOper oper, var_types type, Scev* op1, Scev* op2) : ScevUnop(oper, type, op1), Op2(op2) + { + } + + Scev* const Op2; +}; + +// Represents a value that evolves by an add recurrence. +// The value at iteration N is Start + N * Step. +// "Start" and "Step" are guaranteed to be invariant in "Loop". +struct ScevAddRec : Scev +{ + ScevAddRec(var_types type, Scev* start, Scev* step DEBUGARG(FlowGraphNaturalLoop* loop)) + : Scev(ScevOper::AddRec, type), Start(start), Step(step) DEBUGARG(Loop(loop)) + { + } + + Scev* const Start; + Scev* const Step; + INDEBUG(FlowGraphNaturalLoop* const Loop); +}; + +typedef JitHashTable, Scev*> ScalarEvolutionMap; + +// Scalar evolution is analyzed in the context of a single loop, and are +// computed on-demand by the use of the "Analyze" method on this class, which +// also maintains a cache. +class ScalarEvolutionContext +{ + Compiler* m_comp; + FlowGraphNaturalLoop* m_loop = nullptr; + ScalarEvolutionMap m_cache; + + Scev* Analyze(BasicBlock* block, GenTree* tree, int depth); + Scev* AnalyzeNew(BasicBlock* block, GenTree* tree, int depth); + Scev* CreateSimpleAddRec(GenTreeLclVarCommon* headerStore, + ScevLocal* start, + BasicBlock* stepDefBlock, + GenTree* stepDefData); + Scev* CreateSimpleInvariantScev(GenTree* tree); + Scev* CreateScevForConstant(GenTreeIntConCommon* tree); + +public: + ScalarEvolutionContext(Compiler* comp); + + void ResetForLoop(FlowGraphNaturalLoop* loop); + + ScevConstant* NewConstant(var_types type, int64_t value); + ScevLocal* NewLocal(unsigned lclNum, unsigned ssaNum); + ScevUnop* NewExtension(ScevOper oper, var_types targetType, Scev* op); + ScevBinop* NewBinop(ScevOper oper, Scev* op1, Scev* op2); + ScevAddRec* NewAddRec(Scev* start, Scev* step); + + Scev* Analyze(BasicBlock* block, GenTree* tree); + Scev* Simplify(Scev* scev); +};
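For reference, a minimal sketch of how a phase drives the analysis declared above, mirroring its use in optInductionVariables (the wrapper function and its name are hypothetical; it assumes the DFS tree and m_loops have already been computed):

// Hypothetical helper showing the intended usage of ScalarEvolutionContext.
static void VisitPrimaryIVs(Compiler* comp)
{
    ScalarEvolutionContext scevContext(comp);
    for (FlowGraphNaturalLoop* loop : comp->m_loops->InReversePostOrder())
    {
        scevContext.ResetForLoop(loop);

        for (Statement* stmt : loop->GetHeader()->Statements())
        {
            if (!stmt->IsPhiDefnStmt())
            {
                break;
            }

            // Analyze the value defined by the header phi and simplify it.
            Scev* scev = scevContext.Analyze(loop->GetHeader(), stmt->GetRootNode());
            if (scev == nullptr)
            {
                continue;
            }

            scev = scevContext.Simplify(scev);
            if (scev->OperIs(ScevOper::AddRec))
            {
                // 'stmt' defines a primary induction variable of 'loop'.
            }
        }
    }
}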