Skip to content

Commit

Permalink
[SYCL] Disable vectorization and loop transformation passes (#2458)
Browse files Browse the repository at this point in the history
Loop unrolling in "SYCL optimization mode" uses the default heuristic, which
is tuned for CPUs and might not be profitable for other devices.
  • Loading branch information
bader authored May 12, 2021
1 parent 07b2796 commit ff6929e
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 136 deletions.
9 changes: 0 additions & 9 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6634,17 +6634,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_gnu_inline_asm, true))
CmdArgs.push_back("-fno-gnu-inline-asm");

bool EnableSYCLEarlyOptimizations =
Args.hasFlag(options::OPT_fsycl_early_optimizations,
options::OPT_fno_sycl_early_optimizations,
Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga);

// Enable vectorization by default according to the optimization level
// selected. For optimization levels that want vectorization we use the alias
// option to simplify the hasFlag logic.
bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false);
if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
EnableVec = false; // But disable vectorization for SYCL device code
OptSpecifier VectorizeAliasOption =
EnableVec ? options::OPT_O_Group : options::OPT_fvectorize;
if (Args.hasFlag(options::OPT_fvectorize, VectorizeAliasOption,
Expand All @@ -6653,8 +6646,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,

// -fslp-vectorize is enabled based on the optimization level selected.
bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true);
if (RawTriple.isSPIR() && EnableSYCLEarlyOptimizations)
EnableSLPVec = false; // But disable vectorization for SYCL device code
OptSpecifier SLPVectAliasOption =
EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize;
if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption,
Expand Down
10 changes: 0 additions & 10 deletions clang/test/Driver/sycl-device-optimizations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,3 @@
// RUN: | FileCheck -check-prefix=CHECK-DAE %s
// CHECK-DAE: clang{{.*}} "-fenable-sycl-dae"
// CHECK-DAE: sycl-post-link{{.*}} "-emit-param-info"

/// Check that vectorizers are disabled by default:
// RUN: %clang -### -fsycl %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHECK-VEC-DEFAULT %s
// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-loops"
// CHECK-VEC-DEFAULT-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-vectorize-slp"
/// Check that vectorizers can still be enabled manually:
// RUN: %clang -### -fsycl -fvectorize -fslp-vectorize %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHECK-VEC-ENABLE %s
// CHECK-VEC-ENABLE: clang{{.*}} "-fsycl-is-device"{{.*}}"-vectorize-loops"{{.*}}"-vectorize-slp"
242 changes: 125 additions & 117 deletions llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,54 +429,54 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions

// Begin the loop pass pipeline.
if (EnableSimpleLoopUnswitch) {
// The simple loop unswitch pass relies on separate cleanup passes. Schedule
// them first so when we re-process a loop they run before other loop
// passes.
MPM.add(createLoopInstSimplifyPass());
MPM.add(createLoopSimplifyCFGPass());
}
// Try to remove as much code from the loop header as possible,
// to reduce amount of IR that will have to be duplicated.
// TODO: Investigate promotion cap for O1.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
// TODO: Investigate promotion cap for O1.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
if (EnableSimpleLoopUnswitch)
MPM.add(createSimpleLoopUnswitchLegacyPass());
else
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
// FIXME: We break the loop pass pipeline here in order to do full
// simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
// need for this.
MPM.add(createCFGSimplificationPass());
MPM.add(createInstructionCombiningPass());
// We resume loop passes creating a second loop pipeline here.
if (EnableLoopFlatten) {
MPM.add(createLoopFlattenPass()); // Flatten loops
MPM.add(createLoopSimplifyCFGPass());
// Do not run the loop pass pipeline in "SYCL Optimization Mode". Loop
// optimizations rely on TTI, which is not accurate for the SPIR target.
if (!SYCLOptimizationMode) {
// Begin the loop pass pipeline.
if (EnableSimpleLoopUnswitch) {
// The simple loop unswitch pass relies on separate cleanup passes.
// Schedule them first so when we re-process a loop they run before other
// loop passes.
MPM.add(createLoopInstSimplifyPass());
MPM.add(createLoopSimplifyCFGPass());
}
// Try to remove as much code from the loop header as possible,
// to reduce amount of IR that will have to be duplicated.
// TODO: Investigate promotion cap for O1.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
// TODO: Investigate promotion cap for O1.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
if (EnableSimpleLoopUnswitch)
MPM.add(createSimpleLoopUnswitchLegacyPass());
else
MPM.add(
createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
// FIXME: We break the loop pass pipeline here in order to do full
// simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
// the need for this.
MPM.add(createCFGSimplificationPass());
MPM.add(createInstructionCombiningPass());
// We resume loop passes creating a second loop pipeline here.
if (EnableLoopFlatten) {
MPM.add(createLoopFlattenPass()); // Flatten loops
MPM.add(createLoopSimplifyCFGPass());
}
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops

if (EnableLoopInterchange)
MPM.add(createLoopInterchangePass()); // Interchange loops

// Unroll small loops and perform peeling.
MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
ForgetAllSCEVInLoopUnroll));
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
// This ends the loop pass pipelines.
}
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
// TODO: this pass hurts performance due to promotion of induction variables
// from 32-bit values to 64-bit values. I assume it's because SPIR is a virtual
// target with an unlimited number of registers and the pass doesn't take into
// account that on real HW this promotion is not beneficial.
if (!SYCLOptimizationMode)
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops

if (EnableLoopInterchange)
MPM.add(createLoopInterchangePass()); // Interchange loops

// Unroll small loops and perform peeling.
MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
ForgetAllSCEVInLoopUnroll));
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
// This ends the loop pass pipelines.

// Break up allocas that may now be splittable after loop unrolling.
MPM.add(createSROAPass());
Expand Down Expand Up @@ -788,68 +788,74 @@ void PassManagerBuilder::populateModulePassManager(

addExtensionsToPM(EP_VectorizerStart, MPM);

// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer relies
// on the rotated form. Disable header duplication at -Oz.
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));

// Distribute loops to allow partial vectorization. I.e. isolate dependences
// into separate loop that would otherwise inhibit vectorization. This is
// currently only performed for loops marked with the metadata
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
MPM.add(createLoopDistributePass());

MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));

// Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration.
MPM.add(createLoopLoadEliminationPass());

// FIXME: Because of #pragma vectorize enable, the passes below are always
// inserted in the pipeline, even when the vectorizer doesn't run (ex. when
// on -O1 and no #pragma is found). Would be good to have these two passes
// as function calls, so that we can only pass them when the vectorizer
// changed the code.
MPM.add(createInstructionCombiningPass());
if (OptLevel > 1 && ExtraVectorizerPasses) {
// At higher optimization levels, try to clean up any runtime overlap and
// alignment checks inserted by the vectorizer. We want to track correlated
// runtime checks for two inner loops in the same outer loop, fold any
// common computations, hoist loop-invariant aspects out of any outer loop,
// and unswitch the runtime checks if possible. Once hoisted, we may have
// dead (or speculatable) control flows or more combining opportunities.
MPM.add(createEarlyCSEPass());
MPM.add(createCorrelatedValuePropagationPass());
MPM.add(createInstructionCombiningPass());
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
if (!SYCLOptimizationMode) {
// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer
// relies on the rotated form. Disable header duplication at -Oz.
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));

// Distribute loops to allow partial vectorization. I.e. isolate
// dependences into separate loop that would otherwise inhibit
// vectorization. This is currently only performed for loops marked with
// the metadata llvm.loop.distribute=true or when -enable-loop-distribute is
// specified.
MPM.add(createLoopDistributePass());

MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));

// Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration.
MPM.add(createLoopLoadEliminationPass());

// FIXME: Because of #pragma vectorize enable, the passes below are always
// inserted in the pipeline, even when the vectorizer doesn't run (ex. when
// on -O1 and no #pragma is found). Would be good to have these two passes
// as function calls, so that we can only pass them when the vectorizer
// changed the code.
MPM.add(createInstructionCombiningPass());
}

// Cleanup after loop vectorization, etc. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
// FIXME: study whether hoisting and/or sinking of common instructions should
// be delayed until after SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));

if (SLPVectorize) {
MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
if (OptLevel > 1 && ExtraVectorizerPasses) {
// At higher optimization levels, try to clean up any runtime overlap and
// alignment checks inserted by the vectorizer. We want to track
// correlated runtime checks for two inner loops in the same outer loop,
// fold any common computations, hoist loop-invariant aspects out of any
// outer loop, and unswitch the runtime checks if possible. Once hoisted,
// we may have dead (or speculatable) control flows or more combining
// opportunities.
MPM.add(createEarlyCSEPass());
MPM.add(createCorrelatedValuePropagationPass());
MPM.add(createInstructionCombiningPass());
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
MPM.add(
createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
MPM.add(createInstructionCombiningPass());
}
}

// Enhance/cleanup vector code.
MPM.add(createVectorCombinePass());
// Cleanup after loop vectorization, etc. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
// The extra sinking transform can create larger basic blocks, so do this
// before SLP vectorization.
// FIXME: study whether hoisting and/or sinking of common instructions
// should be delayed until after the SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));

if (SLPVectorize) {
MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
if (OptLevel > 1 && ExtraVectorizerPasses) {
MPM.add(createEarlyCSEPass());
}
}

// Enhance/cleanup vector code.
MPM.add(createVectorCombinePass());
}

addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createInstructionCombiningPass());
Expand All @@ -861,22 +867,24 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopUnrollAndJamPass(OptLevel));
}

// Unroll small loops
MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
ForgetAllSCEVInLoopUnroll));
if (!SYCLOptimizationMode) {
// Unroll small loops
MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
ForgetAllSCEVInLoopUnroll));

if (!DisableUnrollLoops) {
// LoopUnroll may generate some redundancy to clean up.
MPM.add(createInstructionCombiningPass());
if (!DisableUnrollLoops) {
// LoopUnroll may generate some redundancy to clean up.
MPM.add(createInstructionCombiningPass());

// Runtime unrolling will introduce runtime check in loop prologue. If the
// unrolled loop is an inner loop, then the prologue will be inside the
// outer loop. LICM pass can help to promote the runtime check out if the
// checked value is loop invariant.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
}
// Runtime unrolling will introduce runtime check in loop prologue. If the
// unrolled loop is an inner loop, then the prologue will be inside the
// outer loop. LICM pass can help to promote the runtime check out if the
// checked value is loop invariant.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
}

MPM.add(createWarnMissedTransformationsPass());
MPM.add(createWarnMissedTransformationsPass());
}

// After vectorization and unrolling, assume intrinsics may tell us more
// about pointer alignments.
Expand Down

0 comments on commit ff6929e

Please sign in to comment.